From 8618e30bc14b06bfafa0f164cca7b0e06451f88a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 8 Oct 2012 20:37:30 -0700
Subject: rbd: let con_work() handle backoff

Both ceph_fault() and con_work() include handling for imposing a
delay before doing further processing on a faulted connection.
The latter is used only if ceph_fault() is unable to.

Instead, just let con_work() always be responsible for implementing
the delay.  After setting up the delay value, set the BACKOFF flag
on the connection unconditionally and call queue_con() to ensure
con_work() will get called to handle it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index cad0d17ec45e..973c16c20c42 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2398,24 +2398,8 @@ static void ceph_fault(struct ceph_connection *con)
 			con->delay = BASE_DELAY_INTERVAL;
 		else if (con->delay < MAX_DELAY_INTERVAL)
 			con->delay *= 2;
-		con->ops->get(con);
-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay))) {
-			dout("fault queued %p delay %lu\n", con, con->delay);
-		} else {
-			con->ops->put(con);
-			dout("fault failed to queue %p delay %lu, backoff\n",
-			     con, con->delay);
-			/*
-			 * In many cases we see a socket state change
-			 * while con_work is running and end up
-			 * queuing (non-delayed) work, such that we
-			 * can't backoff with a delay.  Set a flag so
-			 * that when con_work restarts we schedule the
-			 * delay then.
-			 */
-			set_bit(CON_FLAG_BACKOFF, &con->flags);
-		}
+		set_bit(CON_FLAG_BACKOFF, &con->flags);
+		queue_con(con);
 	}
 
 out_unlock:
-- 
cgit v1.2.3-59-g8ed1b


From 802c6d967fbdcd2cbc91b917425661bb8bbfaade Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 8 Oct 2012 20:37:30 -0700
Subject: rbd: define common queue_con_delay()

This patch defines a single function, queue_con_delay() to call
queue_delayed_work() for a connection.  It basically generalizes
what was previously queue_con() by adding the delay argument.
queue_con() is now a simple helper that passes 0 for its delay.
queue_con_delay() returns 0 if it queued work or an errno if it
did not for some reason.

If con_work() finds the BACKOFF flag set for a connection, it now
calls queue_con_delay() to handle arranging to start again after a
delay.

Note about connection reference counts:  con_work() only ever gets
called as a work item function.  At the time that work is scheduled,
a reference to the connection is acquired, and the corresponding
con_work() call is then responsible for dropping that reference
before it returns.

Previously, the backoff handling inside con_work() silently handed
off its reference to delayed work it scheduled.  Now that
queue_con_delay() is used, a new reference is acquired for the
newly-scheduled work, and the original reference is dropped by the
con->ops->put() call at the end of the function.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 973c16c20c42..66f6f56bcb23 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2244,22 +2244,33 @@ bad_tag:
 
 
 /*
- * Atomically queue work on a connection.  Bump @con reference to
- * avoid races with connection teardown.
+ * Atomically queue work on a connection after the specified delay.
+ * Bump @con reference to avoid races with connection teardown.
+ * Returns 0 if work was queued, or an error code otherwise.
  */
-static void queue_con(struct ceph_connection *con)
+static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
 {
 	if (!con->ops->get(con)) {
-		dout("queue_con %p ref count 0\n", con);
-		return;
+		dout("%s %p ref count 0\n", __func__, con);
+
+		return -ENOENT;
 	}
 
-	if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) {
-		dout("queue_con %p - already queued\n", con);
+	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
+		dout("%s %p - already queued\n", __func__, con);
 		con->ops->put(con);
-	} else {
-		dout("queue_con %p\n", con);
+
+		return -EBUSY;
 	}
+
+	dout("%s %p %lu\n", __func__, con, delay);
+
+	return 0;
+}
+
+static void queue_con(struct ceph_connection *con)
+{
+	(void) queue_con_delay(con, 0);
 }
 
 /*
@@ -2294,14 +2305,11 @@ restart:
 
 	if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
 		dout("con_work %p backing off\n", con);
-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay))) {
-			dout("con_work %p backoff %lu\n", con, con->delay);
-			mutex_unlock(&con->mutex);
-			return;
-		} else {
+		ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+		if (ret) {
 			dout("con_work %p FAILED to back off %lu\n", con,
 			     con->delay);
+			BUG_ON(ret == -ENOENT);
 			set_bit(CON_FLAG_BACKOFF, &con->flags);
 		}
 		goto done;
-- 
cgit v1.2.3-59-g8ed1b


From 7246240c7c186542f73af4fadc744d66440f616f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 25 Oct 2012 08:49:41 -0700
Subject: libceph: avoid NULL kref_put from NULL alloc_msg return

The ceph_on_in_msg_alloc() method calls the ->alloc_msg() helper which
may return NULL.  It also drops con->mutex while it allocates a message,
which means that the connection state may change (e.g., get closed).  If
that happens, we clean up and bail out.  Avoid calling ceph_msg_put() on
a NULL return value and triggering a crash.

This was observed when an ->alloc_msg() call races with a timeout that
resends a zillion messages and resets the connection, and ->alloc_msg()
returns NULL (because the request was resent to another target).

Fixes http://tracker.newdream.net/issues/3342

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/messenger.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 66f6f56bcb23..1041114453db 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2742,7 +2742,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 		msg = con->ops->alloc_msg(con, hdr, skip);
 		mutex_lock(&con->mutex);
 		if (con->state != CON_STATE_OPEN) {
-			ceph_msg_put(msg);
+			if (msg)
+				ceph_msg_put(msg);
 			return -EAGAIN;
 		}
 		con->in_msg = msg;
-- 
cgit v1.2.3-59-g8ed1b


From 0ed7285e0001b960c888e5455ae982025210ed3d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 29 Oct 2012 11:01:42 -0700
Subject: libceph: fix osdmap decode error paths

Ensure that we set the err value correctly so that we do not pass a 0
value to ERR_PTR and confuse the calling code.  (In particular,
osd_client.c handle_map() will BUG(!newmap)).

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osdmap.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 5433fb0eb3c6..f552aa48fd9e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -645,10 +645,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
 		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		err = -ENOMEM;
 		pi = kzalloc(sizeof(*pi), GFP_NOFS);
 		if (!pi)
 			goto bad;
 		pi->id = ceph_decode_32(p);
+		err = -EINVAL;
 		ev = ceph_decode_8(p); /* encoding version */
 		if (ev > CEPH_PG_POOL_VERSION) {
 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
@@ -664,8 +666,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		__insert_pg_pool(&map->pg_pools, pi);
 	}
 
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
+	if (version >= 5) {
+		err = __decode_pool_names(p, end, map);
+		if (err < 0) {
+			dout("fail to decode pool names");
+			goto bad;
+		}
+	}
 
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
 
@@ -745,7 +752,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	return map;
 
 bad:
-	dout("osdmap_decode fail\n");
+	dout("osdmap_decode fail err %d\n", err);
 	ceph_osdmap_destroy(map);
 	return ERR_PTR(err);
 }
@@ -839,6 +846,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		if (ev > CEPH_PG_POOL_VERSION) {
 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
 				   ev, CEPH_PG_POOL_VERSION);
+			err = -EINVAL;
 			goto bad;
 		}
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
@@ -855,8 +863,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		if (err < 0)
 			goto bad;
 	}
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
+	if (version >= 5) {
+		err = __decode_pool_names(p, end, map);
+		if (err < 0)
+			goto bad;
+	}
 
 	/* old_pool */
 	ceph_decode_32_safe(p, end, len, bad);
@@ -932,15 +943,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			(void) __remove_pg_mapping(&map->pg_temp, pgid);
 
 			/* insert */
-			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
-				err = -EINVAL;
+			err = -EINVAL;
+			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
 				goto bad;
-			}
+			err = -ENOMEM;
 			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
-			if (!pg) {
-				err = -ENOMEM;
+			if (!pg)
 				goto bad;
-			}
 			pg->pgid = pgid;
 			pg->len = pglen;
 			for (j = 0; j < pglen; j++)
-- 
cgit v1.2.3-59-g8ed1b


From 72afc71ffca0f444ee0e1ef8c7e34ab209bb48b3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 30 Oct 2012 19:40:33 -0500
Subject: libceph: define ceph_pg_pool_name_by_id()

Define and export function ceph_pg_pool_name_by_id() to supply
the name of a pg pool whose id is given.  This will be used by
the next patch.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/osdmap.h |  1 +
 net/ceph/osdmap.c           | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e88a620b9f8a..5ea57ba69320 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -123,6 +123,7 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
 				struct ceph_pg pgid);
 
+extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
 
 #endif
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index f552aa48fd9e..de73214b5d26 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -469,6 +469,22 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
+{
+	struct ceph_pg_pool_info *pi;
+
+	if (id == CEPH_NOPOOL)
+		return NULL;
+
+	if (WARN_ON_ONCE(id > (u64) INT_MAX))
+		return NULL;
+
+	pi = __lookup_pg_pool(&map->pg_pools, (int) id);
+
+	return pi ? pi->name : NULL;
+}
+EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
+
 int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
 {
 	struct rb_node *rbp;
-- 
cgit v1.2.3-59-g8ed1b


From 83aff95eb9d60aff5497e9f44a2ae906b86d8e88 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Wed, 28 Nov 2012 12:28:24 -0800
Subject: libceph: remove 'osdtimeout' option

This would reset a connection with any OSD that had an outstanding
request that was taking more than N seconds.  The idea was that if the
OSD was buggy, the client could compensate by resending the request.

In reality, this only served to hide server bugs, and we haven't
actually seen such a bug in quite a while.  Moreover, the userspace
client code never did this.

More importantly, often the request is taking a long time because the
OSD is trying to recover, or overloaded, and killing the connection
and retrying would only make the situation worse by giving the OSD
more work to do.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/super.c              |  2 --
 include/linux/ceph/libceph.h |  2 --
 net/ceph/ceph_common.c       |  3 +--
 net/ceph/osd_client.c        | 47 ++++----------------------------------------
 4 files changed, 5 insertions(+), 49 deletions(-)

(limited to 'net')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2f586b0e5e0f..fcda1c73a1e5 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
 	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
 		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
-	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, ",osdkeepalivetimeout=%d",
 			   opt->osd_keepalive_timeout);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 42624789b06f..317aff8feb0a 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -43,7 +43,6 @@ struct ceph_options {
 	struct ceph_entity_addr my_addr;
 	int mount_timeout;
 	int osd_idle_ttl;
-	int osd_timeout;
 	int osd_keepalive_timeout;
 
 	/*
@@ -63,7 +62,6 @@ struct ceph_options {
  * defaults
  */
 #define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
 #define CEPH_OSD_KEEPALIVE_DEFAULT  5
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a8020293f342..ee71ea26777a 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -305,7 +305,6 @@ ceph_parse_options(char *options, const char *dev_name,
 
 	/* start with defaults */
 	opt->flags = CEPH_OPT_DEFAULT;
-	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
@@ -391,7 +390,7 @@ ceph_parse_options(char *options, const char *dev_name,
 
 			/* misc */
 		case Opt_osdtimeout:
-			opt->osd_timeout = intval;
+			pr_warning("ignoring deprecated osdtimeout option\n");
 			break;
 		case Opt_osdkeepalivetimeout:
 			opt->osd_keepalive_timeout = intval;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ccbdfbba9e53..7ebfe13267e6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -608,14 +608,6 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 	}
 }
 
-static void kick_osd_requests(struct ceph_osd_client *osdc,
-			      struct ceph_osd *kickosd)
-{
-	mutex_lock(&osdc->request_mutex);
-	__kick_osd_requests(osdc, kickosd);
-	mutex_unlock(&osdc->request_mutex);
-}
-
 /*
  * If the osd connection drops, we need to resubmit all requests.
  */
@@ -629,7 +621,9 @@ static void osd_reset(struct ceph_connection *con)
 	dout("osd_reset osd%d\n", osd->o_osd);
 	osdc = osd->o_osdc;
 	down_read(&osdc->map_sem);
-	kick_osd_requests(osdc, osd);
+	mutex_lock(&osdc->request_mutex);
+	__kick_osd_requests(osdc, osd);
+	mutex_unlock(&osdc->request_mutex);
 	send_queued(osdc);
 	up_read(&osdc->map_sem);
 }
@@ -1091,12 +1085,10 @@ static void handle_timeout(struct work_struct *work)
 {
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
-	struct ceph_osd_request *req, *last_req = NULL;
+	struct ceph_osd_request *req;
 	struct ceph_osd *osd;
-	unsigned long timeout = osdc->client->options->osd_timeout * HZ;
 	unsigned long keepalive =
 		osdc->client->options->osd_keepalive_timeout * HZ;
-	unsigned long last_stamp = 0;
 	struct list_head slow_osds;
 	dout("timeout\n");
 	down_read(&osdc->map_sem);
@@ -1105,37 +1097,6 @@ static void handle_timeout(struct work_struct *work)
 
 	mutex_lock(&osdc->request_mutex);
 
-	/*
-	 * reset osds that appear to be _really_ unresponsive.  this
-	 * is a failsafe measure.. we really shouldn't be getting to
-	 * this point if the system is working properly.  the monitors
-	 * should mark the osd as failed and we should find out about
-	 * it from an updated osd map.
-	 */
-	while (timeout && !list_empty(&osdc->req_lru)) {
-		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
-				 r_req_lru_item);
-
-		/* hasn't been long enough since we sent it? */
-		if (time_before(jiffies, req->r_stamp + timeout))
-			break;
-
-		/* hasn't been long enough since it was acked? */
-		if (req->r_request->ack_stamp == 0 ||
-		    time_before(jiffies, req->r_request->ack_stamp + timeout))
-			break;
-
-		BUG_ON(req == last_req && req->r_stamp == last_stamp);
-		last_req = req;
-		last_stamp = req->r_stamp;
-
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
-			   req->r_tid, osd->o_osd);
-		__kick_osd_requests(osdc, osd);
-	}
-
 	/*
 	 * ping osds that are a bit slow.  this ensures that if there
 	 * is a break in the TCP connection we will notice, and reopen
-- 
cgit v1.2.3-59-g8ed1b


From 7d5f24812bd182a2471cb69c1c2baf0648332e1f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 29 Nov 2012 08:37:03 -0600
Subject: ceph: don't reference req after put

In __unregister_request(), there is a call to list_del_init()
referencing a request that was the subject of a call to
ceph_osdc_put_request() on the previous line.  This is not
safe, because the request structure could have been freed
by the time we reach the list_del_init().

Fix this by reversing the order of these lines.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7ebfe13267e6..ac7be7202faa 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -871,9 +871,9 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 			req->r_osd = NULL;
 	}
 
+	list_del_init(&req->r_req_lru_item);
 	ceph_osdc_put_request(req);
 
-	list_del_init(&req->r_req_lru_item);
 	if (osdc->num_requests == 0) {
 		dout(" no requests, canceling timeout\n");
 		__cancel_osd_timeout(osdc);
-- 
cgit v1.2.3-59-g8ed1b


From 685a7555ca69030739ddb57a47f0ea8ea80196a4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 7 Dec 2012 09:57:58 -0600
Subject: libceph: avoid using freed osd in __kick_osd_requests()

If an osd has no requests and no linger requests, __reset_osd()
will just remove it with a call to __remove_osd().  That drops
a reference to the osd, and therefore the osd may have been free
by the time __reset_osd() returns.  That function offers no
indication this may have occurred, and as a result the osd will
continue to be used even when it's no longer valid.

Change__reset_osd() so it returns an error (ENODEV) when it
deletes the osd being reset.  And change __kick_osd_requests() so it
returns immediately (before referencing osd again) if __reset_osd()
returns *any* error.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ac7be7202faa..60c74c1f1ea9 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -581,7 +581,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 
 	dout("__kick_osd_requests osd%d\n", osd->o_osd);
 	err = __reset_osd(osdc, osd);
-	if (err == -EAGAIN)
+	if (err)
 		return;
 
 	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
@@ -745,6 +745,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	if (list_empty(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests)) {
 		__remove_osd(osdc, osd);
+		ret = -ENODEV;
 	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
 			  &osd->o_con.peer_addr,
 			  sizeof(osd->o_con.peer_addr)) == 0 &&
-- 
cgit v1.2.3-59-g8ed1b


From 61c74035626beb25a39b0273ccf7d75510bc36a1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 6 Dec 2012 09:37:23 -0600
Subject: rbd: remove linger unconditionally

In __unregister_linger_request(), the request is being removed
from the osd client's req_linger list only when the request
has a non-null osd pointer.  It should be done whether or not
the request currently has an osd.

This is most likely a non-issue because I believe the request
will always have an osd when this function is called.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 60c74c1f1ea9..32bd696b39a8 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -906,8 +906,8 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 					struct ceph_osd_request *req)
 {
 	dout("__unregister_linger_request %p\n", req);
+	list_del_init(&req->r_linger_item);
 	if (req->r_osd) {
-		list_del_init(&req->r_linger_item);
 		list_del_init(&req->r_linger_osd);
 
 		if (list_empty(&req->r_osd->o_requests) &&
-- 
cgit v1.2.3-59-g8ed1b


From 7bb21d68c535ad8be38e14a715632ae398b37ac1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 7 Dec 2012 19:50:07 -0600
Subject: libceph: socket can close in any connection state

A connection's socket can close for any reason, independent of the
state of the connection (and without irrespective of the connection
mutex).  As a result, the connectino can be in pretty much any state
at the time its socket is closed.

Handle those other cases at the top of con_work().  Pull this whole
block of code into a separate function to reduce the clutter.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1041114453db..4b04ccc60db1 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2273,6 +2273,35 @@ static void queue_con(struct ceph_connection *con)
 	(void) queue_con_delay(con, 0);
 }
 
+static bool con_sock_closed(struct ceph_connection *con)
+{
+	if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
+		return false;
+
+#define CASE(x)								\
+	case CON_STATE_ ## x:						\
+		con->error_msg = "socket closed (con state " #x ")";	\
+		break;
+
+	switch (con->state) {
+	CASE(CLOSED);
+	CASE(PREOPEN);
+	CASE(CONNECTING);
+	CASE(NEGOTIATING);
+	CASE(OPEN);
+	CASE(STANDBY);
+	default:
+		pr_warning("%s con %p unrecognized state %lu\n",
+			__func__, con, con->state);
+		con->error_msg = "unrecognized con state";
+		BUG();
+		break;
+	}
+#undef CASE
+
+	return true;
+}
+
 /*
  * Do some work on a connection.  Drop a connection ref when we're done.
  */
@@ -2284,24 +2313,8 @@ static void con_work(struct work_struct *work)
 
 	mutex_lock(&con->mutex);
 restart:
-	if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
-		switch (con->state) {
-		case CON_STATE_CONNECTING:
-			con->error_msg = "connection failed";
-			break;
-		case CON_STATE_NEGOTIATING:
-			con->error_msg = "negotiation failed";
-			break;
-		case CON_STATE_OPEN:
-			con->error_msg = "socket closed";
-			break;
-		default:
-			dout("unrecognized con state %d\n", (int)con->state);
-			con->error_msg = "unrecognized con state";
-			BUG();
-		}
+	if (con_sock_closed(con))
 		goto fault;
-	}
 
 	if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
 		dout("con_work %p backing off\n", con);
-- 
cgit v1.2.3-59-g8ed1b


From 28362986f8743124b3a0fda20a8ed3e80309cce1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 14 Dec 2012 16:47:41 -0600
Subject: libceph: report connection fault with warning

When a connection's socket disconnects, or if there's a protocol
error of some kind on the connection, a fault is signaled and
the connection is reset (closed and reopened, basically).  We
currently get an error message on the log whenever this occurs.

A ceph connection will attempt to reestablish a socket connection
repeatedly if a fault occurs.  This means that these error messages
will get repeatedly added to the log, which is undesirable.

Change the error message to be a warning, so they don't get
logged by default.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4b04ccc60db1..4d111fd2b492 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2377,7 +2377,7 @@ fault:
 static void ceph_fault(struct ceph_connection *con)
 	__releases(con->mutex)
 {
-	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+	pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
 	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
 	dout("fault %p state %lu to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
-- 
cgit v1.2.3-59-g8ed1b


From f407731d12214e7686819018f3a1e9d7b6f83a02 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 6 Dec 2012 07:22:04 -0600
Subject: libceph: init osd->o_node in create_osd()

The red-black node node in the ceph osd structure is not initialized
in create_osd().  Because this node can be the subject of a
RB_EMPTY_NODE() call later on, we should ensure the node is
initialized properly for that.  Add a call to RB_CLEAR_NODE()
initialize it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 32bd696b39a8..a6dc6acd6566 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 	atomic_set(&osd->o_ref, 1);
 	osd->o_osdc = osdc;
 	osd->o_osd = onum;
+	RB_CLEAR_NODE(&osd->o_node);
 	INIT_LIST_HEAD(&osd->o_requests);
 	INIT_LIST_HEAD(&osd->o_linger_requests);
 	INIT_LIST_HEAD(&osd->o_osd_lru);
-- 
cgit v1.2.3-59-g8ed1b


From 3ee5234df68d253c415ba4f2db72ad250d9c21a9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 17 Dec 2012 12:23:48 -0600
Subject: libceph: init event->node in ceph_osdc_create_event()

The red-black node node in the ceph osd event structure is not
initialized in create_osdc_create_event().  Because this node can
be the subject of a RB_EMPTY_NODE() call later on, we should ensure
the node is initialized properly for that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a6dc6acd6566..2bce3d4be1c6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1563,6 +1563,7 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 	event->data = data;
 	event->osdc = osdc;
 	INIT_LIST_HEAD(&event->osd_node);
+	RB_CLEAR_NODE(&event->node);
 	kref_init(&event->kref);   /* one ref for us */
 	kref_get(&event->kref);    /* one ref for the caller */
 	init_completion(&event->completion);
-- 
cgit v1.2.3-59-g8ed1b


From a978fa20fb657548561dddbfb605fe43654f0825 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 17 Dec 2012 12:23:48 -0600
Subject: libceph: don't use rb_init_node() in ceph_osdc_alloc_request()

The red-black node in the ceph osd request structure is initialized
in ceph_osdc_alloc_request() using rbd_init_node().  We do need to
initialize this, because in __unregister_request() we call
RB_EMPTY_NODE(), which expects the node it's checking to have
been initialized.  But rb_init_node() is apparently overkill, and
may in fact be on its way out.  So use RB_CLEAR_NODE() instead.

For a little more background, see this commit:
    4c199a93 rbtree: empty nodes have no color"

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2bce3d4be1c6..7cd0a7f3bf43 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -221,7 +221,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
-	rb_init_node(&req->r_node);
+	RB_CLEAR_NODE(&req->r_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd);
-- 
cgit v1.2.3-59-g8ed1b


From c89ce05e0c5a01a256100ac6a6019f276bdd1ca6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 6 Dec 2012 07:22:04 -0600
Subject: libceph: register request before unregister linger

In kick_requests(), we need to register the request before we
unregister the linger request.  Otherwise the unregister will
reset the request's osd pointer to NULL.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7cd0a7f3bf43..780caf6b0491 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1328,8 +1328,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 
 		dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
 		     req->r_osd ? req->r_osd->o_osd : -1);
-		__unregister_linger_request(osdc, req);
 		__register_request(osdc, req);
+		__unregister_linger_request(osdc, req);
 	}
 	mutex_unlock(&osdc->request_mutex);
 
-- 
cgit v1.2.3-59-g8ed1b