From 2b462638e41ea62230297c21c4da9955937b7a3c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Fri, 29 Aug 2014 15:18:58 -0700
Subject: ocfs2: do not write error flag to user structure we cannot copy
 from/to

If we failed to copy from the structure, writing back the flags leaks 31
bits of kernel memory (the rest of the ir_flags field).

In any case, if we cannot copy from/to the structure, why should we
expect putting just the flags to work?

Also make sure ocfs2_info_handle_freeinode() returns the right error
code if the copy_to_user() fails.

Fixes: ddee5cdb70e6 ('Ocfs2: Add new OCFS2_IOC_INFO ioctl for ocfs2 v8.')
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: Joel Becker <jlbec@evilplan.org>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/ioctl.c | 129 +++++++++++++++++++------------------------------------
 1 file changed, 43 insertions(+), 86 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 6f66b3751ace..53e6c40ed4c6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -35,9 +35,8 @@
 		copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
 
 /*
- * This call is void because we are already reporting an error that may
- * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
- * just a best-effort to tell userspace that this request caused the error.
+ * This is just a best-effort to tell userspace that this request
+ * caused the error.
  */
 static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
 					struct ocfs2_info_request __user *req)
@@ -146,136 +145,105 @@ bail:
 static int ocfs2_info_handle_blocksize(struct inode *inode,
 				       struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_blocksize oib;
 
 	if (o2info_from_user(oib, req))
-		goto bail;
+		return -EFAULT;
 
 	oib.ib_blocksize = inode->i_sb->s_blocksize;
 
 	o2info_set_request_filled(&oib.ib_req);
 
 	if (o2info_to_user(oib, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oib.ib_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_clustersize(struct inode *inode,
 					 struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_clustersize oic;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oic, req))
-		goto bail;
+		return -EFAULT;
 
 	oic.ic_clustersize = osb->s_clustersize;
 
 	o2info_set_request_filled(&oic.ic_req);
 
 	if (o2info_to_user(oic, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oic.ic_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_maxslots(struct inode *inode,
 				      struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_maxslots oim;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oim, req))
-		goto bail;
+		return -EFAULT;
 
 	oim.im_max_slots = osb->max_slots;
 
 	o2info_set_request_filled(&oim.im_req);
 
 	if (o2info_to_user(oim, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oim.im_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_label(struct inode *inode,
 				   struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_label oil;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oil, req))
-		goto bail;
+		return -EFAULT;
 
 	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
 
 	o2info_set_request_filled(&oil.il_req);
 
 	if (o2info_to_user(oil, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oil.il_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_uuid(struct inode *inode,
 				  struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_uuid oiu;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oiu, req))
-		goto bail;
+		return -EFAULT;
 
 	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
 
 	o2info_set_request_filled(&oiu.iu_req);
 
 	if (o2info_to_user(oiu, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oiu.iu_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_fs_features(struct inode *inode,
 					 struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_fs_features oif;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oif, req))
-		goto bail;
+		return -EFAULT;
 
 	oif.if_compat_features = osb->s_feature_compat;
 	oif.if_incompat_features = osb->s_feature_incompat;
@@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode,
 	o2info_set_request_filled(&oif.if_req);
 
 	if (o2info_to_user(oif, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oif.if_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_journal_size(struct inode *inode,
 					  struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_journal_size oij;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oij, req))
-		goto bail;
+		return -EFAULT;
 
 	oij.ij_journal_size = i_size_read(osb->journal->j_inode);
 
 	o2info_set_request_filled(&oij.ij_req);
 
 	if (o2info_to_user(oij, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oij.ij_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
@@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
 	u32 i;
 	u64 blkno = -1;
 	char namebuf[40];
-	int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+	int status, type = INODE_ALLOC_SYSTEM_INODE;
 	struct ocfs2_info_freeinode *oifi = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct inode *inode_alloc = NULL;
@@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
 		goto out_err;
 	}
 
-	if (o2info_from_user(*oifi, req))
-		goto bail;
+	if (o2info_from_user(*oifi, req)) {
+		status = -EFAULT;
+		goto out_free;
+	}
 
 	oifi->ifi_slotnum = osb->max_slots;
 
@@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
 
 	o2info_set_request_filled(&oifi->ifi_req);
 
-	if (o2info_to_user(*oifi, req))
-		goto bail;
+	if (o2info_to_user(*oifi, req)) {
+		status = -EFAULT;
+		goto out_free;
+	}
 
 	status = 0;
 bail:
 	if (status)
 		o2info_set_request_error(&oifi->ifi_req, req);
-
+out_free:
 	kfree(oifi);
 out_err:
 	return status;
@@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
 {
 	u64 blkno = -1;
 	char namebuf[40];
-	int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+	int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
 
 	struct ocfs2_info_freefrag *oiff;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
 		goto out_err;
 	}
 
-	if (o2info_from_user(*oiff, req))
-		goto bail;
+	if (o2info_from_user(*oiff, req)) {
+		status = -EFAULT;
+		goto out_free;
+	}
 	/*
 	 * chunksize from userspace should be power of 2.
 	 */
@@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
 
 	if (o2info_to_user(*oiff, req)) {
 		status = -EFAULT;
-		goto bail;
+		goto out_free;
 	}
 
 	status = 0;
 bail:
 	if (status)
 		o2info_set_request_error(&oiff->iff_req, req);
-
+out_free:
 	kfree(oiff);
 out_err:
 	return status;
@@ -727,23 +690,17 @@ out_err:
 static int ocfs2_info_handle_unknown(struct inode *inode,
 				     struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_request oir;
 
 	if (o2info_from_user(oir, req))
-		goto bail;
+		return -EFAULT;
 
 	o2info_clear_request_filled(&oir);
 
 	if (o2info_to_user(oir, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oir, req);
-
-	return status;
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From c43c363def04cdaed0d9e26dae846081f55714e7 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Fri, 29 Aug 2014 15:19:00 -0700
Subject: ocfs2: o2net: don't shutdown connection when idle timeout

This patch series is to fix a possible message lost bug in ocfs2 when
network go bad.  This bug will cause ocfs2 hung forever even network
become good again.

The messages may lost in this case.  After the tcp connection is
established between two nodes, an idle timer will be set to check its
state periodically, if no messages are received during this time, idle
timer will timeout, it will shutdown the connection and try to
reconnect, so pending messages in tcp queues will be lost.  This
messages may be from dlm.  Dlm may get hung in this case.  This may
cause the whole ocfs2 cluster hung.

This is very possible to happen when network state goes bad.  Do the
reconnect is useless, it will fail if network state is still bad.  Just
waiting there for network recovering may be a good idea, it will not
lost messages and some node will be fenced until cluster goes into
split-brain state, for this case, Tcp user timeout is used to override
the tcp retransmit timeout.  It will timeout after 25 days, user should
have notice this through the provided log and fix the network, if they
don't, ocfs2 will fall back to original reconnect way.

This patch (of 3):

Some messages in the tcp queue maybe lost if we shutdown the connection
and reconnect when idle timeout.  If packets lost and reconnect success,
then the ocfs2 cluster maybe hung.

To fix this, we can leave the connection there and do the fence decision
when idle timeout, if network recover before fence dicision is made, the
connection survive without lost any messages.

This bug can be saw when network state go bad.  It may cause ocfs2 hung
forever if some packets lost.  With this fix, ocfs2 will recover from
hung if network becomes good again.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/tcp.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 681691bc233a..2334bfc966c1 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1536,16 +1536,20 @@ static void o2net_idle_timer(unsigned long data)
 #endif
 
 	printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
-	       "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
-	       msecs / 1000, msecs % 1000);
+	       "idle for %lu.%lu secs.\n",
+	       SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
 
-	/*
-	 * Initialize the nn_timeout so that the next connection attempt
-	 * will continue in o2net_start_connect.
+	/* idle timerout happen, don't shutdown the connection, but
+	 * make fence decision. Maybe the connection can recover before
+	 * the decision is made.
 	 */
 	atomic_set(&nn->nn_timeout, 1);
+	o2quo_conn_err(o2net_num_from_nn(nn));
+	queue_delayed_work(o2net_wq, &nn->nn_still_up,
+			msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+
+	o2net_sc_reset_idle_timer(sc);
 
-	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
 
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1560,6 +1564,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
 {
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	/* clear fence decision since the connection recover from timeout*/
+	if (atomic_read(&nn->nn_timeout)) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		cancel_delayed_work(&nn->nn_still_up);
+		atomic_set(&nn->nn_timeout, 0);
+	}
+
 	/* Only push out an existing timer */
 	if (timer_pending(&sc->sc_idle_timeout))
 		o2net_sc_reset_idle_timer(sc);
-- 
cgit v1.2.3-59-g8ed1b


From 8e9801dfe37c9e68cdbfcd15988df2187191864e Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Fri, 29 Aug 2014 15:19:02 -0700
Subject: ocfs2: o2net: set tcp user timeout to max value

When tcp retransmit timeout(15mins), the connection will be closed.
Pending messages may be lost during this time.  So we set tcp user
timeout to override the retransmit timeout to the max value.  This is OK
for ocfs2 since we have disk heartbeat, if peer crash, the disk
heartbeat will timeout and it will be evicted, if disk heartbeat not
timeout and connection idle for a long time, then this means the cluster
enters split-brain state, since fence can't happen, we'd better keep the
connection and wait network recover.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/tcp.c | 20 ++++++++++++++++++++
 fs/ocfs2/cluster/tcp.h |  1 +
 2 files changed, 21 insertions(+)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2334bfc966c1..ea34952f9496 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock)
 	return ret;
 }
 
+static int o2net_set_usertimeout(struct socket *sock)
+{
+	int user_timeout = O2NET_TCP_USER_TIMEOUT;
+
+	return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+				(char *)&user_timeout, sizeof(user_timeout));
+}
+
 static void o2net_initialize_handshake(void)
 {
 	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
@@ -1663,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work)
 		goto out;
 	}
 
+	ret = o2net_set_usertimeout(sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 
 	spin_lock(&nn->nn_lock);
@@ -1844,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more)
 		goto out;
 	}
 
+	ret = o2net_set_usertimeout(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
 	slen = sizeof(sin);
 	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
 				       &slen, 1);
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 5bada2a69b50..c571e849fda4 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
 #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT	2000
 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT		30000
 
+#define O2NET_TCP_USER_TIMEOUT			0x7fffffff
 
 /* TODO: figure this out.... */
 static inline int o2net_link_down(int err, struct socket *sock)
-- 
cgit v1.2.3-59-g8ed1b


From 8c7b638cece146234b0c0d5f6ba84d1cf6f81e83 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Fri, 29 Aug 2014 15:19:04 -0700
Subject: ocfs2: quorum: add a log for node not fenced

For debug use, we can see from the log whether the fence decision is
made and why it is not fenced.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/quorum.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ec141e758d7..62e8ec619b4c 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work)
 	}
 
 out:
-	spin_unlock(&qs->qs_lock);
-	if (fence)
+	if (fence) {
+		spin_unlock(&qs->qs_lock);
 		o2quo_fence_self();
+	} else {
+		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
+			"connected: %d, lowest: %d (%sreachable)\n",
+			qs->qs_heartbeating, qs->qs_connected, lowest_hb,
+			lowest_reachable ? "" : "un");
+		spin_unlock(&qs->qs_lock);
+
+	}
+
 }
 
 static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
-- 
cgit v1.2.3-59-g8ed1b