From 120a75ea9f4ba02f852171e75d44f29139b9c83e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 25 Jul 2019 20:16:39 +0800
Subject: libceph: add function that resets client's entity addr

The new function also re-opens connections to OSDs and monitors, and
re-sends in-flight OSD requests after the connections are re-opened.

Signed-off-by: "Yan, Zheng"
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 net/ceph/ceph_common.c |  8 ++++++++
 net/ceph/messenger.c   |  6 ++++++
 net/ceph/mon_client.c  |  7 +++++++
 net/ceph/osd_client.c  | 18 ++++++++++++++++++
 4 files changed, 39 insertions(+)

diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4eeea4d5c3ef..b412a3ccc4fc 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -694,6 +694,14 @@ void ceph_destroy_client(struct ceph_client *client)
 }
 EXPORT_SYMBOL(ceph_destroy_client);
 
+void ceph_reset_client_addr(struct ceph_client *client)
+{
+	ceph_messenger_reset_nonce(&client->msgr);
+	ceph_monc_reopen_session(&client->monc);
+	ceph_osdc_reopen_osds(&client->osdc);
+}
+EXPORT_SYMBOL(ceph_reset_client_addr);
+
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 962f521c863e..e4cb3db2ee77 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con)
 	}
 }
 
+void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
+{
+	u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
+	msgr->inst.addr.nonce = cpu_to_le32(nonce);
+	encode_my_addr(msgr);
+}
 
 /*
  * initialize a new messenger instance
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 0520bf9825aa..7256c402ebaa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc)
 	__open_session(monc);
 }
 
+void ceph_monc_reopen_session(struct ceph_mon_client *monc)
+{
+	mutex_lock(&monc->mutex);
+	reopen_session(monc);
+	mutex_unlock(&monc->mutex);
+}
+
 static void un_backoff(struct ceph_mon_client *monc)
 {
 	monc->hunt_mult /= 2;	/* reduce by 50% */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 78ae6e8c953d..a252ba80dd82 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5086,6 +5086,24 @@ out_put_req:
 }
 EXPORT_SYMBOL(ceph_osdc_call);
 
+/*
+ * reset all osd connections
+ */
+void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
+{
+	struct rb_node *n;
+
+	down_write(&osdc->lock);
+	for (n = rb_first(&osdc->osds); n; ) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		n = rb_next(n);
+		if (!reopen_osd(osd))
+			kick_osd_requests(osd);
+	}
+	up_write(&osdc->lock);
+}
+
 /*
  * init, shutdown
  */
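A note on the nonce bump in ceph_messenger_reset_nonce(): the entity address's nonce is little-endian on the wire, so the increment has to round-trip through byte-order helpers, and the result makes the (address, nonce) pair identify a brand-new client instance to the peers. A standalone userspace sketch of the same arithmetic, using <endian.h> in place of the kernel's le32 helpers (illustrative only, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>
	#include <endian.h>

	/* Decode the little-endian nonce, bump it, re-encode it. */
	static uint32_t bump_nonce_le(uint32_t nonce_le)
	{
		uint32_t nonce = le32toh(nonce_le) + 1000000;

		return htole32(nonce);
	}

	int main(void)
	{
		uint32_t wire = htole32(42);

		wire = bump_nonce_le(wire);
		printf("new nonce: %u\n", le32toh(wire)); /* prints 1000042 */
		return 0;
	}
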
From 2cef0ba8032ce0df3f29621921fe7d6372f68d3e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 25 Jul 2019 20:16:40 +0800
Subject: libceph: add function that clears osd client's abort_err

Signed-off-by: "Yan, Zheng"
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 include/linux/ceph/osd_client.h | 1 +
 net/ceph/osd_client.c           | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d1c3e829f30d..eaffbdddf89a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -389,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
 void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
 
 #define osd_req_op_data(oreq, whch, typ, fld)				\
 ({									\
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a252ba80dd82..6f49c59e5e58 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2476,6 +2476,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
 }
 EXPORT_SYMBOL(ceph_osdc_abort_requests);
 
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
+{
+	down_write(&osdc->lock);
+	osdc->abort_err = 0;
+	up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
+
 static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
 {
 	if (likely(eb > osdc->epoch_barrier)) {
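Taken together with the previous patch, these helpers support a "force reconnect" sequence in a libceph consumer such as the ceph filesystem's recovery-from-blocklisting path. A hedged sketch of how a consumer might wire them up — the function name and the choice of -EIO are illustrative; only the ceph_* calls come from this series:

	#include <linux/ceph/libceph.h>
	#include <linux/errno.h>

	/* Illustrative recovery path for a blocklisted client. */
	static void example_force_reconnect(struct ceph_client *client)
	{
		/* Fail all in-flight requests so their waiters unblock. */
		ceph_osdc_abort_requests(&client->osdc, -EIO);

		/* Pick a new nonce and re-open MON/OSD sessions. */
		ceph_reset_client_addr(client);

		/* Stop failing new submissions with the stale abort error. */
		ceph_osdc_clear_abort_err(&client->osdc);
	}
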
From 4766815b1179dbe4fe263a5f95795710e29276e2 Mon Sep 17 00:00:00 2001
From: David Disseldorp
Date: Wed, 3 Jul 2019 17:59:20 +0200
Subject: libceph: handle OSD op ceph_pagelist_append() errors

osd_req_op_cls_init() and osd_req_op_xattr_init() currently propagate
ceph_pagelist_alloc() ENOMEM errors but ignore ceph_pagelist_append()
memory allocation failures. Add these checks and clean up on error.

Signed-off-by: David Disseldorp
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 net/ceph/osd_client.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 6f49c59e5e58..2f44971e1347 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 	struct ceph_pagelist *pagelist;
 	size_t payload_len = 0;
 	size_t size;
+	int ret;
 
 	op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
 
@@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 	size = strlen(class);
 	BUG_ON(size > (size_t) U8_MAX);
 	op->cls.class_len = size;
-	ceph_pagelist_append(pagelist, class, size);
+	ret = ceph_pagelist_append(pagelist, class, size);
+	if (ret)
+		goto err_pagelist_free;
 	payload_len += size;
 
 	op->cls.method_name = method;
 	size = strlen(method);
 	BUG_ON(size > (size_t) U8_MAX);
 	op->cls.method_len = size;
-	ceph_pagelist_append(pagelist, method, size);
+	ret = ceph_pagelist_append(pagelist, method, size);
+	if (ret)
+		goto err_pagelist_free;
 	payload_len += size;
 
 	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
-
 	op->indata_len = payload_len;
 	return 0;
+
+err_pagelist_free:
+	ceph_pagelist_release(pagelist);
+	return ret;
 }
 EXPORT_SYMBOL(osd_req_op_cls_init);
 
@@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 					    opcode, 0);
 	struct ceph_pagelist *pagelist;
 	size_t payload_len;
+	int ret;
 
 	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
 
@@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 
 	payload_len = strlen(name);
 	op->xattr.name_len = payload_len;
-	ceph_pagelist_append(pagelist, name, payload_len);
+	ret = ceph_pagelist_append(pagelist, name, payload_len);
+	if (ret)
+		goto err_pagelist_free;
 
 	op->xattr.value_len = size;
-	ceph_pagelist_append(pagelist, value, size);
+	ret = ceph_pagelist_append(pagelist, value, size);
+	if (ret)
+		goto err_pagelist_free;
 	payload_len += size;
 
 	op->xattr.cmp_op = cmp_op;
@@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
 	op->indata_len = payload_len;
 	return 0;
+
+err_pagelist_free:
+	ceph_pagelist_release(pagelist);
+	return ret;
 }
 EXPORT_SYMBOL(osd_req_op_xattr_init);
 
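With append failures now propagated, callers must treat osd_req_op_cls_init() as fully fallible rather than assuming it only fails at pagelist allocation. A caller-side sketch — the surrounding function and the class/method names are placeholders, only the osd_req_op_cls_init() call and its signature are from the tree:

	#include <linux/ceph/osd_client.h>

	/* Illustrative: set up op 0 as a CALL and propagate -ENOMEM. */
	static int example_setup_call_op(struct ceph_osd_request *req)
	{
		int ret;

		ret = osd_req_op_cls_init(req, 0, "example_cls", "example_method");
		if (ret)
			return ret;	/* pagelist alloc or append failed */

		return 0;
	}
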
From 8edf84ba4d0ecad7d1c5b393ba004b959dae76ea Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 21 Aug 2019 13:57:18 +0200
Subject: libceph: drop unused con parameter of calc_target()

This bit was omitted from a561372405cf ("libceph: fix PG split vs OSD
(re)connect race") to avoid backport conflicts.

Signed-off-by: Ilya Dryomov
---
 net/ceph/osd_client.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2f44971e1347..ba45b074a362 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1505,7 +1505,6 @@ enum calc_target_result {
 
 static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 					   struct ceph_osd_request_target *t,
-					   struct ceph_connection *con,
 					   bool any_change)
 {
 	struct ceph_pg_pool_info *pi;
@@ -2289,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
-	ct_res = calc_target(osdc, &req->r_t, NULL, false);
+	ct_res = calc_target(osdc, &req->r_t, false);
 	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
 		goto promote;
 
@@ -3112,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
 		lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
 	}
 
-	calc_target(osdc, &lreq->t, NULL, false);
+	calc_target(osdc, &lreq->t, false);
 	osd = lookup_create_osd(osdc, lreq->t.osd, true);
 	link_linger(osd, lreq);
 
@@ -3729,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
 	struct ceph_osd_client *osdc = lreq->osdc;
 	enum calc_target_result ct_res;
 
-	ct_res = calc_target(osdc, &lreq->t, NULL, true);
+	ct_res = calc_target(osdc, &lreq->t, true);
 	if (ct_res == CALC_TARGET_NEED_RESEND) {
 		struct ceph_osd *osd;
 
@@ -3801,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd,
 		n = rb_next(n); /* unlink_request(), check_pool_dne() */
 
 		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
-		ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
-				     false);
+		ct_res = calc_target(osdc, &req->r_t, false);
 		switch (ct_res) {
 		case CALC_TARGET_NO_ACTION:
 			force_resend_writes = cleared_full ||
@@ -3911,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
 		n = rb_next(n);
 
 		if (req->r_t.epoch < osdc->osdmap->epoch) {
-			ct_res = calc_target(osdc, &req->r_t, NULL, false);
+			ct_res = calc_target(osdc, &req->r_t, false);
 			if (ct_res == CALC_TARGET_POOL_DNE) {
 				erase_request(need_resend, req);
 				check_pool_dne(req);
From 10c12851a022662bf6085bd4384b4ebed4c447ce Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Fri, 30 Aug 2019 20:44:24 +0200
Subject: libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc()

The vmalloc allocator doesn't fully respect the specified gfp mask:
while the actual pages are allocated as requested, the page table pages
are always allocated with GFP_KERNEL. ceph_kvmalloc() may be called
with GFP_NOFS and GFP_NOIO (for ceph and rbd respectively), so this may
result in a deadlock.

There is no real reason for the current PAGE_ALLOC_COSTLY_ORDER logic,
it's just something that seemed sensible at the time (ceph_kvmalloc()
predates kvmalloc()). kvmalloc() is smarter: in an attempt to reduce
long-term fragmentation, it first tries to kmalloc non-disruptively.

Switch to kvmalloc() and set the respective PF_MEMALLOC_* flag using
the scope API to avoid the deadlock. Note that kvmalloc() needs to be
passed GFP_KERNEL to enable the fallback.

Signed-off-by: Ilya Dryomov
Reviewed-by: Jeff Layton
---
 net/ceph/ceph_common.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index b412a3ccc4fc..2d568246803f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -13,6 +13,7 @@
 #include <linux/nsproxy.h>
 #include <linux/parser.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
@@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt,
 }
 EXPORT_SYMBOL(ceph_compare_options);
 
+/*
+ * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
+ * compatible with (a superset of) GFP_KERNEL.  This is because while the
+ * actual pages are allocated with the specified flags, the page table pages
+ * are always allocated with GFP_KERNEL.  map_vm_area() doesn't even take
+ * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
+ *
+ * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
+ */
 void *ceph_kvmalloc(size_t size, gfp_t flags)
 {
-	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *ptr = kmalloc(size, flags | __GFP_NOWARN);
-		if (ptr)
-			return ptr;
+	void *p;
+
+	if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) {
+		p = kvmalloc(size, flags);
+	} else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) {
+		unsigned int nofs_flag = memalloc_nofs_save();
+		p = kvmalloc(size, GFP_KERNEL);
+		memalloc_nofs_restore(nofs_flag);
+	} else {
+		unsigned int noio_flag = memalloc_noio_save();
+		p = kvmalloc(size, GFP_KERNEL);
+		memalloc_noio_restore(noio_flag);
 	}
 
-	return __vmalloc(size, flags, PAGE_KERNEL);
+	return p;
 }
 
-
 static int parse_fsid(const char *str, struct ceph_fsid *fsid)
 {
 	int i = 0;
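The memalloc scope API used above generalizes beyond ceph_kvmalloc(): every allocation made inside a nofs/noio scope is implicitly restricted, even in callees that pass GFP_KERNEL. A minimal kernel-style sketch of the idiom — the helper itself is hypothetical, not part of this patch:

	#include <linux/mm.h>
	#include <linux/sched/mm.h>

	/*
	 * Hypothetical helper: kvmalloc() under a nofs scope. The vmalloc
	 * fallback stays enabled because the gfp mask is GFP_KERNEL, while
	 * the scope flag keeps reclaim from recursing into the filesystem.
	 */
	static void *example_kvmalloc_nofs(size_t size)
	{
		unsigned int nofs_flag = memalloc_nofs_save();
		void *p = kvmalloc(size, GFP_KERNEL);

		memalloc_nofs_restore(nofs_flag);
		return p;
	}
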
From cf73d882cc51c1f245a890cccb79952a260302d3 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 4 Sep 2019 15:40:03 +0200
Subject: libceph: use ceph_kvmalloc() for osdmap arrays

osdmap has a bunch of arrays that grow linearly with the number of
OSDs. osd_state, osd_weight and osd_primary_affinity take 4 bytes per
OSD. osd_addr takes 136 bytes per OSD because of sockaddr_storage.
The CRUSH workspace area also grows linearly with the number of OSDs.

Normally these arrays are allocated at client startup. The osdmap is
usually updated in small incrementals, but once in a while a full map
may need to be processed. For a cluster with 10000 OSDs, this means a
bunch of 40K allocations followed by a 1.3M allocation, all of which
are currently required to be physically contiguous. This results in
sporadic ENOMEM errors, hanging the client.

Go back to manually (re)allocating arrays and use ceph_kvmalloc() to
fall back to non-contiguous allocation when necessary.

Link: https://tracker.ceph.com/issues/40481
Signed-off-by: Ilya Dryomov
Reviewed-by: Jeff Layton
---
 net/ceph/osdmap.c | 69 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 90437906b7bc..4e0de14f80bb 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 						 struct ceph_pg_pool_info, node);
 		__remove_pg_pool(&map->pg_pools, pi);
 	}
-	kfree(map->osd_state);
-	kfree(map->osd_weight);
-	kfree(map->osd_addr);
-	kfree(map->osd_primary_affinity);
-	kfree(map->crush_workspace);
+	kvfree(map->osd_state);
+	kvfree(map->osd_weight);
+	kvfree(map->osd_addr);
+	kvfree(map->osd_primary_affinity);
+	kvfree(map->crush_workspace);
 	kfree(map);
 }
 
@@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
  *
  * The new elements are properly initialized.
  */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
 {
 	u32 *state;
 	u32 *weight;
 	struct ceph_entity_addr *addr;
+	u32 to_copy;
 	int i;
 
-	state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
-	if (!state)
-		return -ENOMEM;
-	map->osd_state = state;
+	dout("%s old %u new %u\n", __func__, map->max_osd, max);
+	if (max == map->max_osd)
+		return 0;
 
-	weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
-	if (!weight)
+	state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
+	weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
+	addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
+	if (!state || !weight || !addr) {
+		kvfree(state);
+		kvfree(weight);
+		kvfree(addr);
 		return -ENOMEM;
-	map->osd_weight = weight;
+	}
 
-	addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
-	if (!addr)
-		return -ENOMEM;
-	map->osd_addr = addr;
+	to_copy = min(map->max_osd, max);
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, to_copy * sizeof(*state));
+		memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
+		memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
+		kvfree(map->osd_state);
+		kvfree(map->osd_weight);
+		kvfree(map->osd_addr);
+	}
 
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
 	for (i = map->max_osd; i < max; i++) {
 		map->osd_state[i] = 0;
 		map->osd_weight[i] = CEPH_OSD_OUT;
@@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 	if (map->osd_primary_affinity) {
 		u32 *affinity;
 
-		affinity = krealloc(map->osd_primary_affinity,
-				    max*sizeof(*affinity), GFP_NOFS);
+		affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
+					 GFP_NOFS);
 		if (!affinity)
 			return -ENOMEM;
-		map->osd_primary_affinity = affinity;
 
+		memcpy(affinity, map->osd_primary_affinity,
+		       to_copy * sizeof(*affinity));
+		kvfree(map->osd_primary_affinity);
+
+		map->osd_primary_affinity = affinity;
 		for (i = map->max_osd; i < max; i++)
 			map->osd_primary_affinity[i] =
 			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
 
 	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
 	dout("%s work_size %zu bytes\n", __func__, work_size);
-	workspace = kmalloc(work_size, GFP_NOIO);
+	workspace = ceph_kvmalloc(work_size, GFP_NOIO);
 	if (!workspace) {
 		crush_destroy(crush);
 		return -ENOMEM;
@@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
 
 	if (map->crush)
 		crush_destroy(map->crush);
-	kfree(map->crush_workspace);
+	kvfree(map->crush_workspace);
 	map->crush = crush;
 	map->crush_workspace = workspace;
 	return 0;
@@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
 	if (!map->osd_primary_affinity) {
 		int i;
 
-		map->osd_primary_affinity = kmalloc_array(map->max_osd,
-							  sizeof(u32),
-							  GFP_NOFS);
+		map->osd_primary_affinity = ceph_kvmalloc(
+		    array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
+		    GFP_NOFS);
 		if (!map->osd_primary_affinity)
 			return -ENOMEM;
 
@@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end,
 
 	ceph_decode_32_safe(p, end, len, e_inval);
 	if (len == 0) {
-		kfree(map->osd_primary_affinity);
+		kvfree(map->osd_primary_affinity);
 		map->osd_primary_affinity = NULL;
 		return 0;
 	}
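Since kvrealloc() did not exist at this point in the kernel, the conversion above has to open-code grow-and-copy. The idiom in isolation, as a hedged sketch — the helper and its name are hypothetical, and the patch uses ceph_kvmalloc() instead of bare kvmalloc() precisely so the vmalloc fallback also works under GFP_NOFS/GFP_NOIO:

	#include <linux/mm.h>
	#include <linux/overflow.h>
	#include <linux/string.h>

	/*
	 * Hypothetical: grow a possibly vmalloc-backed array to new_n
	 * elements, preserving the first min(old_n, new_n). Returns NULL
	 * (leaving old intact) on allocation failure.
	 */
	static void *example_kvgrow(void *old, size_t old_n, size_t new_n,
				    size_t elem_size, gfp_t flags)
	{
		void *p = kvmalloc(array_size(new_n, elem_size), flags);

		if (!p)
			return NULL;
		if (old) {
			memcpy(p, old,
			       (old_n < new_n ? old_n : new_n) * elem_size);
			kvfree(old);
		}
		return p;
	}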