From dccbf08005df800f5c8e948ab6132ed5536134bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 17 Feb 2018 09:29:58 +0100 Subject: libceph, ceph: change ceph_calc_file_object_mapping() signature - make it void - xlen (object extent length) out parameter should be u32 because only a single stripe unit is mapped at a time Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/osdmap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index d41fad99c0fa..92314035dac1 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -280,10 +280,9 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, bool any_change); -/* calculate mapping of a file extent to an object */ -extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 len, - u64 *bno, u64 *oxoff, u64 *oxlen); +void ceph_calc_file_object_mapping(struct ceph_file_layout *l, + u64 off, u64 len, + u64 *objno, u64 *objoff, u32 *xlen); int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, const struct ceph_object_id *oid, -- cgit v1.2.3-59-g8ed1b From 5359a17d2706b86da2af83027343d5eb256f7670 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Jan 2018 10:30:10 +0100 Subject: libceph, rbd: new bio handling code (aka don't clone bios) The reason we clone bios is to be able to give each object request (and consequently each ceph_osd_data/ceph_msg_data item) its own pointer to a (list of) bio(s). The messenger then initializes its cursor with cloned bio's ->bi_iter, so it knows where to start reading from/writing to. That's all the cloned bios are used for: to determine each object request's starting position in the provided data buffer. Introduce ceph_bio_iter to do exactly that -- store position within bio list (i.e. pointer to bio) + position within that bio (i.e. bvec_iter). 
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 67 +++++++++++++++----------- include/linux/ceph/messenger.h | 59 +++++++++++++++++++---- include/linux/ceph/osd_client.h | 11 +++-- net/ceph/messenger.c | 101 ++++++++++++++-------------------------- net/ceph/osd_client.c | 13 ++++-- 5 files changed, 139 insertions(+), 112 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 883f17d6deeb..8eaebf609611 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -218,7 +218,7 @@ typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); enum obj_request_type { OBJ_REQUEST_NODATA = 1, - OBJ_REQUEST_BIO, + OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ OBJ_REQUEST_PAGES, }; @@ -270,7 +270,7 @@ struct rbd_obj_request { enum obj_request_type type; union { - struct bio *bio_list; + struct ceph_bio_iter bio_pos; struct { struct page **pages; u32 page_count; @@ -1255,6 +1255,27 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, return length; } +static void zero_bvec(struct bio_vec *bv) +{ + void *buf; + unsigned long flags; + + buf = bvec_kmap_irq(bv, &flags); + memset(buf, 0, bv->bv_len); + flush_dcache_page(bv->bv_page); + bvec_kunmap_irq(buf, &flags); +} + +static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) +{ + struct ceph_bio_iter it = *bio_pos; + + ceph_bio_iter_advance(&it, off); + ceph_bio_iter_advance_step(&it, bytes, ({ + zero_bvec(&bv); + })); +} + /* * bio helpers */ @@ -1719,13 +1740,14 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); if (obj_request->result == -ENOENT) { if (obj_request->type == OBJ_REQUEST_BIO) - zero_bio_chain(obj_request->bio_list, 0); + zero_bios(&obj_request->bio_pos, 0, length); else zero_pages(obj_request->pages, 0, length); obj_request->result = 0; } else if (xferred < length && !obj_request->result) { if (obj_request->type == OBJ_REQUEST_BIO) - 
zero_bio_chain(obj_request->bio_list, xferred); + zero_bios(&obj_request->bio_pos, xferred, + length - xferred); else zero_pages(obj_request->pages, xferred, length); } @@ -2036,11 +2058,8 @@ static void rbd_obj_request_destroy(struct kref *kref) rbd_assert(obj_request_type_valid(obj_request->type)); switch (obj_request->type) { case OBJ_REQUEST_NODATA: - break; /* Nothing to do */ case OBJ_REQUEST_BIO: - if (obj_request->bio_list) - bio_chain_put(obj_request->bio_list); - break; + break; /* Nothing to do */ case OBJ_REQUEST_PAGES: /* img_data requests don't own their page array */ if (obj_request->pages && @@ -2368,7 +2387,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, if (obj_request->type == OBJ_REQUEST_BIO) osd_req_op_extent_osd_data_bio(osd_request, num_ops, - obj_request->bio_list, length); + &obj_request->bio_pos, length); else if (obj_request->type == OBJ_REQUEST_PAGES) osd_req_op_extent_osd_data_pages(osd_request, num_ops, obj_request->pages, length, @@ -2396,8 +2415,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; - struct bio *bio_list = NULL; - unsigned int bio_offset = 0; + struct ceph_bio_iter bio_it; struct page **pages = NULL; enum obj_operation_type op_type; u64 img_offset; @@ -2412,9 +2430,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, op_type = rbd_img_request_op_type(img_request); if (type == OBJ_REQUEST_BIO) { - bio_list = data_desc; + bio_it = *(struct ceph_bio_iter *)data_desc; rbd_assert(img_offset == - bio_list->bi_iter.bi_sector << SECTOR_SHIFT); + bio_it.iter.bi_sector << SECTOR_SHIFT); } else if (type == OBJ_REQUEST_PAGES) { pages = data_desc; } @@ -2440,17 +2458,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, rbd_img_obj_request_add(img_request, obj_request); if (type == OBJ_REQUEST_BIO) { - 
unsigned int clone_size; - - rbd_assert(length <= (u64)UINT_MAX); - clone_size = (unsigned int)length; - obj_request->bio_list = - bio_chain_clone_range(&bio_list, - &bio_offset, - clone_size, - GFP_NOIO); - if (!obj_request->bio_list) - goto out_unwind; + obj_request->bio_pos = bio_it; + ceph_bio_iter_advance(&bio_it, length); } else if (type == OBJ_REQUEST_PAGES) { unsigned int page_count; @@ -2980,7 +2989,7 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) if (obj_request->type == OBJ_REQUEST_BIO) result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - obj_request->bio_list); + &obj_request->bio_pos); else result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, obj_request->pages); @@ -4093,9 +4102,13 @@ static void rbd_queue_workfn(struct work_struct *work) if (op_type == OBJ_OP_DISCARD) result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, NULL); - else + else { + struct ceph_bio_iter bio_it = { .bio = rq->bio, + .iter = rq->bio->bi_iter }; + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - rq->bio); + &bio_it); + } if (result) goto err_img_request; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index ead9d85f1c11..d7b9605fd51d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -93,14 +93,60 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } } +#ifdef CONFIG_BLOCK + +struct ceph_bio_iter { + struct bio *bio; + struct bvec_iter iter; +}; + +#define __ceph_bio_iter_advance_step(it, n, STEP) do { \ + unsigned int __n = (n), __cur_n; \ + \ + while (__n) { \ + BUG_ON(!(it)->iter.bi_size); \ + __cur_n = min((it)->iter.bi_size, __n); \ + (void)(STEP); \ + bio_advance_iter((it)->bio, &(it)->iter, __cur_n); \ + if (!(it)->iter.bi_size && (it)->bio->bi_next) { \ + dout("__ceph_bio_iter_advance_step next bio\n"); \ + (it)->bio = (it)->bio->bi_next; \ + (it)->iter = (it)->bio->bi_iter; \ + } \ + __n -= __cur_n; \ + } \ +} 
while (0) + +/* + * Advance @it by @n bytes. + */ +#define ceph_bio_iter_advance(it, n) \ + __ceph_bio_iter_advance_step(it, n, 0) + +/* + * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec. + */ +#define ceph_bio_iter_advance_step(it, n, BVEC_STEP) \ + __ceph_bio_iter_advance_step(it, n, ({ \ + struct bio_vec bv; \ + struct bvec_iter __cur_iter; \ + \ + __cur_iter = (it)->iter; \ + __cur_iter.bi_size = __cur_n; \ + __bio_for_each_segment(bv, (it)->bio, __cur_iter, __cur_iter) \ + (void)(BVEC_STEP); \ + })) + +#endif /* CONFIG_BLOCK */ + struct ceph_msg_data { struct list_head links; /* ceph_msg->data */ enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK struct { - struct bio *bio; - size_t bio_length; + struct ceph_bio_iter bio_pos; + u32 bio_length; }; #endif /* CONFIG_BLOCK */ struct { @@ -122,10 +168,7 @@ struct ceph_msg_data_cursor { bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK - struct { /* bio */ - struct bio *bio; /* bio from list */ - struct bvec_iter bvec_iter; - }; + struct ceph_bio_iter bio_iter; #endif /* CONFIG_BLOCK */ struct { /* pages */ unsigned int page_offset; /* offset in page */ @@ -290,8 +333,8 @@ extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK -extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, - size_t length); +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, + u32 length); #endif /* CONFIG_BLOCK */ extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 52fb37d1c2a5..315691490cb0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -72,8 +72,8 @@ struct ceph_osd_data { struct ceph_pagelist *pagelist; #ifdef CONFIG_BLOCK struct { - struct bio *bio; /* list of bios */ - 
size_t bio_length; /* total in list */ + struct ceph_bio_iter bio_pos; + u32 bio_length; }; #endif /* CONFIG_BLOCK */ }; @@ -405,9 +405,10 @@ extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, unsigned int which, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK -extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, - unsigned int which, - struct bio *bio, size_t bio_length); +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, + struct ceph_bio_iter *bio_pos, + u32 bio_length); #endif /* CONFIG_BLOCK */ extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 8a4d3758030b..b9fa8b869c08 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -839,90 +839,57 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; - struct bio *bio; + struct ceph_bio_iter *it = &cursor->bio_iter; - BUG_ON(data->type != CEPH_MSG_DATA_BIO); + cursor->resid = min_t(size_t, length, data->bio_length); + *it = data->bio_pos; + if (cursor->resid < it->iter.bi_size) + it->iter.bi_size = cursor->resid; - bio = data->bio; - BUG_ON(!bio); - - cursor->resid = min(length, data->bio_length); - cursor->bio = bio; - cursor->bvec_iter = bio->bi_iter; - cursor->last_piece = - cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); + BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { - struct ceph_msg_data *data = cursor->data; - struct bio *bio; - struct bio_vec bio_vec; - - BUG_ON(data->type != CEPH_MSG_DATA_BIO); - - bio = cursor->bio; - BUG_ON(!bio); + struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, + cursor->bio_iter.iter); - bio_vec = 
bio_iter_iovec(bio, cursor->bvec_iter); - - *page_offset = (size_t) bio_vec.bv_offset; - BUG_ON(*page_offset >= PAGE_SIZE); - if (cursor->last_piece) /* pagelist offset is always 0 */ - *length = cursor->resid; - else - *length = (size_t) bio_vec.bv_len; - BUG_ON(*length > cursor->resid); - BUG_ON(*page_offset + *length > PAGE_SIZE); - - return bio_vec.bv_page; + *page_offset = bv.bv_offset; + *length = bv.bv_len; + return bv.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { - struct bio *bio; - struct bio_vec bio_vec; - - BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); - - bio = cursor->bio; - BUG_ON(!bio); - - bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); + struct ceph_bio_iter *it = &cursor->bio_iter; - /* Advance the cursor offset */ - - BUG_ON(cursor->resid < bytes); + BUG_ON(bytes > cursor->resid); + BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); cursor->resid -= bytes; + bio_advance_iter(it->bio, &it->iter, bytes); - bio_advance_iter(bio, &cursor->bvec_iter, bytes); + if (!cursor->resid) { + BUG_ON(!cursor->last_piece); + return false; /* no more data */ + } - if (bytes < bio_vec.bv_len) + if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done)) return false; /* more bytes to process in this segment */ - /* Move on to the next segment, and possibly the next bio */ - - if (!cursor->bvec_iter.bi_size) { - bio = bio->bi_next; - cursor->bio = bio; - if (bio) - cursor->bvec_iter = bio->bi_iter; - else - memset(&cursor->bvec_iter, 0, - sizeof(cursor->bvec_iter)); - } - - if (!cursor->last_piece) { - BUG_ON(!cursor->resid); - BUG_ON(!bio); - /* A short read is OK, so use <= rather than == */ - if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) - cursor->last_piece = true; + if (!it->iter.bi_size) { + it->bio = it->bio->bi_next; + it->iter = it->bio->bi_iter; + if (cursor->resid < it->iter.bi_size) + it->iter.bi_size = cursor->resid; } + BUG_ON(cursor->last_piece); + BUG_ON(cursor->resid < 
bio_iter_len(it->bio, it->iter)); + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); return true; } #endif /* CONFIG_BLOCK */ @@ -1163,9 +1130,11 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, page = NULL; break; } + BUG_ON(!page); BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); + BUG_ON(*length > cursor->resid); if (last_piece) *last_piece = cursor->last_piece; @@ -3262,16 +3231,14 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg, EXPORT_SYMBOL(ceph_msg_data_add_pagelist); #ifdef CONFIG_BLOCK -void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, - size_t length) +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, + u32 length) { struct ceph_msg_data *data; - BUG_ON(!bio); - data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); - data->bio = bio; + data->bio_pos = *bio_pos; data->bio_length = length; list_add_tail(&data->links, &msg->data); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4b0485458d26..339d8773ebe8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -146,10 +146,11 @@ static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, #ifdef CONFIG_BLOCK static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, - struct bio *bio, size_t bio_length) + struct ceph_bio_iter *bio_pos, + u32 bio_length) { osd_data->type = CEPH_OSD_DATA_TYPE_BIO; - osd_data->bio = bio; + osd_data->bio_pos = *bio_pos; osd_data->bio_length = bio_length; } #endif /* CONFIG_BLOCK */ @@ -216,12 +217,14 @@ EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, - unsigned int which, struct bio *bio, size_t bio_length) + unsigned int which, + struct ceph_bio_iter *bio_pos, + u32 bio_length) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); - ceph_osd_data_bio_init(osd_data, bio, 
bio_length); + ceph_osd_data_bio_init(osd_data, bio_pos, bio_length); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); #endif /* CONFIG_BLOCK */ @@ -826,7 +829,7 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, ceph_msg_data_add_pagelist(msg, osd_data->pagelist); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_add_bio(msg, osd_data->bio, length); + ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length); #endif } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); -- cgit v1.2.3-59-g8ed1b From b9e281c2b38804984d619e1d9efc4b9020bcb291 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 20 Jan 2018 10:30:11 +0100 Subject: libceph: introduce BVECS data type In preparation for rbd "fancy" striping, introduce ceph_bvec_iter for working with bio_vec array data buffers. The wrappers are trivial, but make it look similar to ceph_bio_iter. Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 42 +++++++++++++++++++++++ include/linux/ceph/osd_client.h | 8 +++++ net/ceph/messenger.c | 75 +++++++++++++++++++++++++++++++++++++++++ net/ceph/osd_client.c | 39 +++++++++++++++++++++ 4 files changed, 164 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index d7b9605fd51d..c7dfcb8a1fb2 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -76,6 +76,7 @@ enum ceph_msg_data_type { #ifdef CONFIG_BLOCK CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ #endif /* CONFIG_BLOCK */ + CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */ }; static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) @@ -87,6 +88,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_BVECS: return true; default: return false; @@ -139,6 +141,42 @@ struct ceph_bio_iter { #endif 
/* CONFIG_BLOCK */ +struct ceph_bvec_iter { + struct bio_vec *bvecs; + struct bvec_iter iter; +}; + +#define __ceph_bvec_iter_advance_step(it, n, STEP) do { \ + BUG_ON((n) > (it)->iter.bi_size); \ + (void)(STEP); \ + bvec_iter_advance((it)->bvecs, &(it)->iter, (n)); \ +} while (0) + +/* + * Advance @it by @n bytes. + */ +#define ceph_bvec_iter_advance(it, n) \ + __ceph_bvec_iter_advance_step(it, n, 0) + +/* + * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec. + */ +#define ceph_bvec_iter_advance_step(it, n, BVEC_STEP) \ + __ceph_bvec_iter_advance_step(it, n, ({ \ + struct bio_vec bv; \ + struct bvec_iter __cur_iter; \ + \ + __cur_iter = (it)->iter; \ + __cur_iter.bi_size = (n); \ + for_each_bvec(bv, (it)->bvecs, __cur_iter, __cur_iter) \ + (void)(BVEC_STEP); \ + })) + +#define ceph_bvec_iter_shorten(it, n) do { \ + BUG_ON((n) > (it)->iter.bi_size); \ + (it)->iter.bi_size = (n); \ +} while (0) + struct ceph_msg_data { struct list_head links; /* ceph_msg->data */ enum ceph_msg_data_type type; @@ -149,6 +187,7 @@ struct ceph_msg_data { u32 bio_length; }; #endif /* CONFIG_BLOCK */ + struct ceph_bvec_iter bvec_pos; struct { struct page **pages; /* NOT OWNER. 
*/ size_t length; /* total # bytes */ @@ -170,6 +209,7 @@ struct ceph_msg_data_cursor { #ifdef CONFIG_BLOCK struct ceph_bio_iter bio_iter; #endif /* CONFIG_BLOCK */ + struct bvec_iter bvec_iter; struct { /* pages */ unsigned int page_offset; /* offset in page */ unsigned short page_index; /* index in array */ @@ -336,6 +376,8 @@ extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, u32 length); #endif /* CONFIG_BLOCK */ +void ceph_msg_data_add_bvecs(struct ceph_msg *msg, + struct ceph_bvec_iter *bvec_pos); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 315691490cb0..528ccc943cee 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -57,6 +57,7 @@ enum ceph_osd_data_type { #ifdef CONFIG_BLOCK CEPH_OSD_DATA_TYPE_BIO, #endif /* CONFIG_BLOCK */ + CEPH_OSD_DATA_TYPE_BVECS, }; struct ceph_osd_data { @@ -76,6 +77,7 @@ struct ceph_osd_data { u32 bio_length; }; #endif /* CONFIG_BLOCK */ + struct ceph_bvec_iter bvec_pos; }; }; @@ -410,6 +412,9 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, struct ceph_bio_iter *bio_pos, u32 bio_length); #endif /* CONFIG_BLOCK */ +void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, + unsigned int which, + struct ceph_bvec_iter *bvec_pos); extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, unsigned int which, @@ -419,6 +424,9 @@ extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); +void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, + unsigned int which, + struct bio_vec *bvecs, u32 bytes); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page 
**pages, u64 length, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b9fa8b869c08..91a57857cf11 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -894,6 +894,58 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, } #endif /* CONFIG_BLOCK */ +static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + struct bio_vec *bvecs = data->bvec_pos.bvecs; + + cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); + cursor->bvec_iter = data->bvec_pos.iter; + cursor->bvec_iter.bi_size = cursor->resid; + + BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); + cursor->last_piece = + cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); +} + +static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, + size_t *length) +{ + struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, + cursor->bvec_iter); + + *page_offset = bv.bv_offset; + *length = bv.bv_len; + return bv.bv_page; +} + +static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; + + BUG_ON(bytes > cursor->resid); + BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); + cursor->resid -= bytes; + bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); + + if (!cursor->resid) { + BUG_ON(!cursor->last_piece); + return false; /* no more data */ + } + + if (!bytes || cursor->bvec_iter.bi_bvec_done) + return false; /* more bytes to process in this segment */ + + BUG_ON(cursor->last_piece); + BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); + cursor->last_piece = + cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); + return true; +} + /* * For a page array, a piece comes from the first page in the array * that has not already been fully consumed. 
@@ -1077,6 +1129,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) ceph_msg_data_bio_cursor_init(cursor, length); break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_BVECS: + ceph_msg_data_bvecs_cursor_init(cursor, length); + break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ @@ -1125,6 +1180,9 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, page = ceph_msg_data_bio_next(cursor, page_offset, length); break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_BVECS: + page = ceph_msg_data_bvecs_next(cursor, page_offset, length); + break; case CEPH_MSG_DATA_NONE: default: page = NULL; @@ -1163,6 +1221,9 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, new_piece = ceph_msg_data_bio_advance(cursor, bytes); break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_BVECS: + new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); + break; case CEPH_MSG_DATA_NONE: default: BUG(); @@ -3247,6 +3308,20 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, EXPORT_SYMBOL(ceph_msg_data_add_bio); #endif /* CONFIG_BLOCK */ +void ceph_msg_data_add_bvecs(struct ceph_msg *msg, + struct ceph_bvec_iter *bvec_pos) +{ + struct ceph_msg_data *data; + + data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS); + BUG_ON(!data); + data->bvec_pos = *bvec_pos; + + list_add_tail(&data->links, &msg->data); + msg->data_length += bvec_pos->iter.bi_size; +} +EXPORT_SYMBOL(ceph_msg_data_add_bvecs); + /* * construct a new message with given type, size * the new msg has a ref count of 1. 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 339d8773ebe8..407be0533c18 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -155,6 +155,13 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, } #endif /* CONFIG_BLOCK */ +static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, + struct ceph_bvec_iter *bvec_pos) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_BVECS; + osd_data->bvec_pos = *bvec_pos; +} + #define osd_req_op_data(oreq, whch, typ, fld) \ ({ \ struct ceph_osd_request *__oreq = (oreq); \ @@ -229,6 +236,17 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); #endif /* CONFIG_BLOCK */ +void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, + unsigned int which, + struct ceph_bvec_iter *bvec_pos) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_bvecs_init(osd_data, bvec_pos); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); + static void osd_req_op_cls_request_info_pagelist( struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist) @@ -266,6 +284,23 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); +void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, + unsigned int which, + struct bio_vec *bvecs, u32 bytes) +{ + struct ceph_osd_data *osd_data; + struct ceph_bvec_iter it = { + .bvecs = bvecs, + .iter = { .bi_size = bytes }, + }; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_bvecs_init(osd_data, &it); + osd_req->r_ops[which].cls.indata_len += bytes; + osd_req->r_ops[which].indata_len += bytes; +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs); + void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 
alignment, bool pages_from_pool, bool own_pages) @@ -291,6 +326,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) case CEPH_OSD_DATA_TYPE_BIO: return (u64)osd_data->bio_length; #endif /* CONFIG_BLOCK */ + case CEPH_OSD_DATA_TYPE_BVECS: + return osd_data->bvec_pos.iter.bi_size; default: WARN(true, "unrecognized data type %d\n", (int)osd_data->type); return 0; @@ -831,6 +868,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length); #endif + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) { + ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos); } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } -- cgit v1.2.3-59-g8ed1b From ed0811d2d243c4195580a9671266031907c02ca7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 2 Feb 2018 15:23:22 +0100 Subject: libceph: striping framework implementation Signed-off-by: Ilya Dryomov --- include/linux/ceph/striper.h | 65 +++++++++++++ net/ceph/Makefile | 1 + net/ceph/striper.c | 226 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 292 insertions(+) create mode 100644 include/linux/ceph/striper.h create mode 100644 net/ceph/striper.c (limited to 'include/linux') diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h new file mode 100644 index 000000000000..74134ee5fdc8 --- /dev/null +++ b/include/linux/ceph/striper.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CEPH_STRIPER_H +#define _LINUX_CEPH_STRIPER_H + +#include +#include + +struct ceph_file_layout; + +struct ceph_object_extent { + struct list_head oe_item; + u64 oe_objno; + u64 oe_off; + u64 oe_len; +}; + +static inline void ceph_object_extent_init(struct ceph_object_extent *ex) +{ + INIT_LIST_HEAD(&ex->oe_item); +} + +/* + * Called for each mapped stripe unit. + * + * @bytes: number of bytes mapped, i.e. 
the minimum of the full length + * requested (file extent length) or the remainder of the stripe + * unit within an object + */ +typedef void (*ceph_object_extent_fn_t)(struct ceph_object_extent *ex, + u32 bytes, void *arg); + +int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + struct ceph_object_extent *alloc_fn(void *arg), + void *alloc_arg, + ceph_object_extent_fn_t action_fn, + void *action_arg); +int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + ceph_object_extent_fn_t action_fn, + void *action_arg); + +struct ceph_file_extent { + u64 fe_off; + u64 fe_len; +}; + +static inline u64 ceph_file_extents_bytes(struct ceph_file_extent *file_extents, + u32 num_file_extents) +{ + u64 bytes = 0; + u32 i; + + for (i = 0; i < num_file_extents; i++) + bytes += file_extents[i].fe_len; + + return bytes; +} + +int ceph_extent_to_file(struct ceph_file_layout *l, + u64 objno, u64 objoff, u64 objlen, + struct ceph_file_extent **file_extents, + u32 *num_file_extents); + +#endif diff --git a/net/ceph/Makefile b/net/ceph/Makefile index b4bded4b5396..12bf49772d24 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -8,6 +8,7 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ mon_client.o \ cls_lock_client.o \ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + striper.o \ debugfs.o \ auth.o auth_none.o \ crypto.o armor.o \ diff --git a/net/ceph/striper.c b/net/ceph/striper.c new file mode 100644 index 000000000000..bc1e4de30df9 --- /dev/null +++ b/net/ceph/striper.c @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include + +#include +#include + +#include +#include +#include + +/* + * Return the last extent with given objno (@object_extents is sorted + * by objno). If not found, return NULL and set @add_pos so that the + * new extent can be added with list_add(new_ex, add_pos).
+ */ +static struct ceph_object_extent * +lookup_last(struct list_head *object_extents, u64 objno, + struct list_head **add_pos) +{ + struct list_head *pos; + + list_for_each_prev(pos, object_extents) { + struct ceph_object_extent *ex = + list_entry(pos, typeof(*ex), oe_item); + + if (ex->oe_objno == objno) + return ex; + + if (ex->oe_objno < objno) + break; + } + + *add_pos = pos; + return NULL; +} + +static struct ceph_object_extent * +lookup_containing(struct list_head *object_extents, u64 objno, + u64 objoff, u32 xlen) +{ + struct ceph_object_extent *ex; + + list_for_each_entry(ex, object_extents, oe_item) { + if (ex->oe_objno == objno && + ex->oe_off <= objoff && + ex->oe_off + ex->oe_len >= objoff + xlen) /* paranoia */ + return ex; + + if (ex->oe_objno > objno) + break; + } + + return NULL; +} + +/* + * Map a file extent to a sorted list of object extents. + * + * We want only one (or as few as possible) object extents per object. + * Adjacent object extents will be merged together; each returned object + * extent may reverse map to multiple different file extents. + * + * Call @alloc_fn for each new object extent and @action_fn for each + * mapped stripe unit, whether it was merged into an already allocated + * object extent or started a new object extent. + * + * Newly allocated object extents are added to @object_extents. + * To keep @object_extents sorted, successive calls to this function + * must map successive file extents (i.e. the list of file extents that + * are mapped using the same @object_extents must be sorted). + * + * The caller is responsible for @object_extents.
+ */ +int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + struct ceph_object_extent *alloc_fn(void *arg), + void *alloc_arg, + ceph_object_extent_fn_t action_fn, + void *action_arg) +{ + struct ceph_object_extent *last_ex, *ex; + + while (len) { + struct list_head *add_pos = NULL; + u64 objno, objoff; + u32 xlen; + + ceph_calc_file_object_mapping(l, off, len, &objno, &objoff, + &xlen); + + last_ex = lookup_last(object_extents, objno, &add_pos); + if (!last_ex || last_ex->oe_off + last_ex->oe_len != objoff) { + ex = alloc_fn(alloc_arg); + if (!ex) + return -ENOMEM; + + ex->oe_objno = objno; + ex->oe_off = objoff; + ex->oe_len = xlen; + if (action_fn) + action_fn(ex, xlen, action_arg); + + if (!last_ex) + list_add(&ex->oe_item, add_pos); + else + list_add(&ex->oe_item, &last_ex->oe_item); + } else { + last_ex->oe_len += xlen; + if (action_fn) + action_fn(last_ex, xlen, action_arg); + } + + off += xlen; + len -= xlen; + } + + for (last_ex = list_first_entry(object_extents, typeof(*ex), oe_item), + ex = list_next_entry(last_ex, oe_item); + &ex->oe_item != object_extents; + last_ex = ex, ex = list_next_entry(ex, oe_item)) { + if (last_ex->oe_objno > ex->oe_objno || + (last_ex->oe_objno == ex->oe_objno && + last_ex->oe_off + last_ex->oe_len >= ex->oe_off)) { + WARN(1, "%s: object_extents list not sorted!\n", + __func__); + return -EINVAL; + } + } + + return 0; +} +EXPORT_SYMBOL(ceph_file_to_extents); + +/* + * A stripped down, non-allocating version of ceph_file_to_extents(), + * for when @object_extents is already populated. 
+ */ +int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + ceph_object_extent_fn_t action_fn, + void *action_arg) +{ + while (len) { + struct ceph_object_extent *ex; + u64 objno, objoff; + u32 xlen; + + ceph_calc_file_object_mapping(l, off, len, &objno, &objoff, + &xlen); + + ex = lookup_containing(object_extents, objno, objoff, xlen); + if (!ex) { + WARN(1, "%s: objno %llu %llu~%u not found!\n", + __func__, objno, objoff, xlen); + return -EINVAL; + } + + action_fn(ex, xlen, action_arg); + + off += xlen; + len -= xlen; + } + + return 0; +} +EXPORT_SYMBOL(ceph_iterate_extents); + +/* + * Reverse map an object extent to a sorted list of file extents. + * + * On success, the caller is responsible for: + * + * kfree(file_extents) + */ +int ceph_extent_to_file(struct ceph_file_layout *l, + u64 objno, u64 objoff, u64 objlen, + struct ceph_file_extent **file_extents, + u32 *num_file_extents) +{ + u32 stripes_per_object = l->object_size / l->stripe_unit; + u64 blockno; /* which su */ + u32 blockoff; /* offset into su */ + u64 stripeno; /* which stripe */ + u32 stripepos; /* which su in the stripe, + which object in the object set */ + u64 objsetno; /* which object set */ + u32 i = 0; + + if (!objlen) { + *file_extents = NULL; + *num_file_extents = 0; + return 0; + } + + *num_file_extents = DIV_ROUND_UP_ULL(objoff + objlen, l->stripe_unit) - + DIV_ROUND_DOWN_ULL(objoff, l->stripe_unit); + *file_extents = kmalloc_array(*num_file_extents, sizeof(**file_extents), + GFP_NOIO); + if (!*file_extents) + return -ENOMEM; + + div_u64_rem(objoff, l->stripe_unit, &blockoff); + while (objlen) { + u64 off, len; + + objsetno = div_u64_rem(objno, l->stripe_count, &stripepos); + stripeno = div_u64(objoff, l->stripe_unit) + + objsetno * stripes_per_object; + blockno = stripeno * l->stripe_count + stripepos; + off = blockno * l->stripe_unit + blockoff; + len = min_t(u64, objlen, l->stripe_unit - blockoff); + + (*file_extents)[i].fe_off = 
off; + (*file_extents)[i].fe_len = len; + + blockoff = 0; + objoff += len; + objlen -= len; + i++; + } + + BUG_ON(i != *num_file_extents); + return 0; +} +EXPORT_SYMBOL(ceph_extent_to_file); -- cgit v1.2.3-59-g8ed1b From 08c1ac508b6dc20ac866e7cdb7279245437c7d26 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 17 Feb 2018 10:41:20 +0100 Subject: libceph, ceph: move ceph_calc_file_object_mapping() to striper.c ceph_calc_file_object_mapping() has nothing to do with osdmaps. Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 1 + fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 5 ----- include/linux/ceph/striper.h | 4 ++++ net/ceph/osd_client.c | 1 + net/ceph/osdmap.c | 37 ------------------------------------- net/ceph/striper.c | 37 ++++++++++++++++++++++++++++++++++++- 7 files changed, 43 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c0fe1b6f47ac..c3557a9ea73d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -15,6 +15,7 @@ #include "mds_client.h" #include "cache.h" #include +#include /* * Ceph address space ops. 
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index b855d24a895a..c90f03beb15d 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -5,7 +5,7 @@ #include "super.h" #include "mds_client.h" #include "ioctl.h" - +#include /* * ioctls diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 92314035dac1..e71fb222c7c3 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -5,7 +5,6 @@ #include #include #include -#include #include /* @@ -280,10 +279,6 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, bool any_change); -void ceph_calc_file_object_mapping(struct ceph_file_layout *l, - u64 off, u64 len, - u64 *objno, u64 *objoff, u32 *xlen); - int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, const struct ceph_object_id *oid, const struct ceph_object_locator *oloc, diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h index 74134ee5fdc8..cbd0d24b7148 100644 --- a/include/linux/ceph/striper.h +++ b/include/linux/ceph/striper.h @@ -7,6 +7,10 @@ struct ceph_file_layout; +void ceph_calc_file_object_mapping(struct ceph_file_layout *l, + u64 off, u64 len, + u64 *objno, u64 *objoff, u32 *xlen); + struct ceph_object_extent { struct list_head oe_item; u64 oe_objno; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 407be0533c18..4a3af96dc057 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -20,6 +20,7 @@ #include #include #include +#include #define OSD_OPREPLY_FRONT_LEN 512 diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index e3ebbe2ecdad..9645ffd6acfb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -4,7 +4,6 @@ #include #include -#include #include #include @@ -2140,42 +2139,6 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting, return false; } -/* - * Map a file extent to a stripe unit within an object. - * Fill in objno, offset into object, and object extent length (i.e. 
the - * number of bytes mapped, less than or equal to @l->stripe_unit). - * - * Example for stripe_count = 3, stripes_per_object = 4: - * - * blockno | 0 3 6 9 | 1 4 7 10 | 2 5 8 11 | 12 15 18 21 | 13 16 19 - * stripeno | 0 1 2 3 | 0 1 2 3 | 0 1 2 3 | 4 5 6 7 | 4 5 6 - * stripepos | 0 | 1 | 2 | 0 | 1 - * objno | 0 | 1 | 2 | 3 | 4 - * objsetno | 0 | 1 - */ -void ceph_calc_file_object_mapping(struct ceph_file_layout *l, - u64 off, u64 len, - u64 *objno, u64 *objoff, u32 *xlen) -{ - u32 stripes_per_object = l->object_size / l->stripe_unit; - u64 blockno; /* which su in the file (i.e. globally) */ - u32 blockoff; /* offset into su */ - u64 stripeno; /* which stripe */ - u32 stripepos; /* which su in the stripe, - which object in the object set */ - u64 objsetno; /* which object set */ - u32 objsetpos; /* which stripe in the object set */ - - blockno = div_u64_rem(off, l->stripe_unit, &blockoff); - stripeno = div_u64_rem(blockno, l->stripe_count, &stripepos); - objsetno = div_u64_rem(stripeno, stripes_per_object, &objsetpos); - - *objno = objsetno * l->stripe_count + stripepos; - *objoff = objsetpos * l->stripe_unit + blockoff; - *xlen = min_t(u64, len, l->stripe_unit - blockoff); -} -EXPORT_SYMBOL(ceph_calc_file_object_mapping); - /* * Map an object into a PG. * diff --git a/net/ceph/striper.c b/net/ceph/striper.c index bc1e4de30df9..c36462dc86b7 100644 --- a/net/ceph/striper.c +++ b/net/ceph/striper.c @@ -5,10 +5,45 @@ #include #include -#include #include #include +/* + * Map a file extent to a stripe unit within an object. + * Fill in objno, offset into object, and object extent length (i.e. the + * number of bytes mapped, less than or equal to @l->stripe_unit). 
+ * + * Example for stripe_count = 3, stripes_per_object = 4: + * + * blockno | 0 3 6 9 | 1 4 7 10 | 2 5 8 11 | 12 15 18 21 | 13 16 19 + * stripeno | 0 1 2 3 | 0 1 2 3 | 0 1 2 3 | 4 5 6 7 | 4 5 6 + * stripepos | 0 | 1 | 2 | 0 | 1 + * objno | 0 | 1 | 2 | 3 | 4 + * objsetno | 0 | 1 + */ +void ceph_calc_file_object_mapping(struct ceph_file_layout *l, + u64 off, u64 len, + u64 *objno, u64 *objoff, u32 *xlen) +{ + u32 stripes_per_object = l->object_size / l->stripe_unit; + u64 blockno; /* which su in the file (i.e. globally) */ + u32 blockoff; /* offset into su */ + u64 stripeno; /* which stripe */ + u32 stripepos; /* which su in the stripe, + which object in the object set */ + u64 objsetno; /* which object set */ + u32 objsetpos; /* which stripe in the object set */ + + blockno = div_u64_rem(off, l->stripe_unit, &blockoff); + stripeno = div_u64_rem(blockno, l->stripe_count, &stripepos); + objsetno = div_u64_rem(stripeno, stripes_per_object, &objsetpos); + + *objno = objsetno * l->stripe_count + stripepos; + *objoff = objsetpos * l->stripe_unit + blockoff; + *xlen = min_t(u64, len, l->stripe_unit - blockoff); +} +EXPORT_SYMBOL(ceph_calc_file_object_mapping); + /* * Return the last extent with given objno (@object_extents is sorted * by objno). If not found, return NULL and set @add_pos so that the -- cgit v1.2.3-59-g8ed1b From bb48bd4dc45f9ee1e44d8e9fcb01023e0d0ba80d Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 13 Mar 2018 10:42:44 +0800 Subject: ceph: optimize memory usage In current code, regular file and directory use same struct ceph_file_info to store fs specific data so the struct has to include some fields which are only used for directory (e.g., readdir related info), when having plenty of regular files, it will lead to memory waste. This patch introduces dedicated ceph_dir_file_info cache for readdir related things. So that regular file does not include those unused fields anymore. 
Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 185 ++++++++++++++++++++++--------------------- fs/ceph/file.c | 88 ++++++++++++++------ fs/ceph/super.c | 8 ++ fs/ceph/super.h | 4 + include/linux/ceph/libceph.h | 1 + 5 files changed, 169 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1aa3bfc9ef35..16405e0774a6 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -102,18 +102,18 @@ static int fpos_cmp(loff_t l, loff_t r) * regardless of what dir changes take place on the * server. */ -static int note_last_dentry(struct ceph_file_info *fi, const char *name, +static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name, int len, unsigned next_offset) { char *buf = kmalloc(len+1, GFP_KERNEL); if (!buf) return -ENOMEM; - kfree(fi->last_name); - fi->last_name = buf; - memcpy(fi->last_name, name, len); - fi->last_name[len] = 0; - fi->next_offset = next_offset; - dout("note_last_dentry '%s'\n", fi->last_name); + kfree(dfi->last_name); + dfi->last_name = buf; + memcpy(dfi->last_name, name, len); + dfi->last_name[len] = 0; + dfi->next_offset = next_offset; + dout("note_last_dentry '%s'\n", dfi->last_name); return 0; } @@ -175,7 +175,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, static int __dcache_readdir(struct file *file, struct dir_context *ctx, int shared_gen) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct dentry *parent = file->f_path.dentry; struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; @@ -222,7 +222,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, bool emit_dentry = false; dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); if (!dentry) { - fi->flags |= CEPH_F_ATEND; + dfi->file_info.flags |= CEPH_F_ATEND; err = 0; break; } @@ -273,33 +273,33 @@ out: if (last) { int ret; di = ceph_dentry(last); 
- ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, + ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len, fpos_off(di->offset) + 1); if (ret < 0) err = ret; dput(last); /* last_name no longer match cache index */ - if (fi->readdir_cache_idx >= 0) { - fi->readdir_cache_idx = -1; - fi->dir_release_count = 0; + if (dfi->readdir_cache_idx >= 0) { + dfi->readdir_cache_idx = -1; + dfi->dir_release_count = 0; } } return err; } -static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos) { - if (!fi->last_readdir) + if (!dfi->last_readdir) return true; if (is_hash_order(pos)) - return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos)); else - return fi->frag != fpos_frag(pos); + return dfi->frag != fpos_frag(pos); } static int ceph_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -310,7 +310,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_reply_info_parsed *rinfo; dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); - if (fi->flags & CEPH_F_ATEND) + if (dfi->file_info.flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ @@ -351,15 +351,15 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (need_send_readdir(fi, ctx->pos)) { + if (need_send_readdir(dfi, ctx->pos)) { struct ceph_mds_request *req; int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; /* discard old result, if any */ - if (fi->last_readdir) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; } if (is_hash_order(ctx->pos)) { @@ -373,7 +373,7 @@ more: } dout("readdir fetching %llx.%llx frag %x offset '%s'\n", - ceph_vinop(inode), frag, fi->last_name); + ceph_vinop(inode), frag, dfi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); @@ -389,8 +389,8 @@ more: __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); req->r_inode_drop = CEPH_CAP_FILE_EXCL; } - if (fi->last_name) { - req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); + if (dfi->last_name) { + req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; @@ -400,10 +400,10 @@ more: cpu_to_le32(fpos_hash(ctx->pos)); } - req->r_dir_release_cnt = fi->dir_release_count; - req->r_dir_ordered_cnt = fi->dir_ordered_count; - req->r_readdir_cache_idx = fi->readdir_cache_idx; - req->r_readdir_offset = fi->next_offset; + req->r_dir_release_cnt = dfi->dir_release_count; + req->r_dir_ordered_cnt = dfi->dir_ordered_count; + req->r_readdir_cache_idx = dfi->readdir_cache_idx; + req->r_readdir_offset = dfi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.flags = cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); @@ -427,35 +427,35 @@ more: if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); if (!rinfo->hash_order) { - fi->next_offset = req->r_readdir_offset; + dfi->next_offset = req->r_readdir_offset; /* adjust ctx->pos to beginning of frag */ ctx->pos = ceph_make_fpos(frag, - fi->next_offset, + dfi->next_offset, false); } } - fi->frag = frag; - fi->last_readdir = req; + dfi->frag = frag; + dfi->last_readdir = req; if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { 
- fi->readdir_cache_idx = req->r_readdir_cache_idx; - if (fi->readdir_cache_idx < 0) { + dfi->readdir_cache_idx = req->r_readdir_cache_idx; + if (dfi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ - fi->dir_ordered_count = 0; + dfi->dir_ordered_count = 0; } else if (ceph_frag_is_leftmost(frag) && - fi->next_offset == 2) { + dfi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ - fi->dir_release_count = req->r_dir_release_cnt; - fi->dir_ordered_count = req->r_dir_ordered_cnt; + dfi->dir_release_count = req->r_dir_release_cnt; + dfi->dir_ordered_count = req->r_dir_ordered_cnt; } } else { dout("readdir !did_prepopulate\n"); /* disable readdir cache */ - fi->readdir_cache_idx = -1; + dfi->readdir_cache_idx = -1; /* preclude from marking dir complete */ - fi->dir_release_count = 0; + dfi->dir_release_count = 0; } /* note next offset and last dentry name */ @@ -464,19 +464,19 @@ more: rinfo->dir_entries + (rinfo->dir_nr-1); unsigned next_offset = req->r_reply_info.dir_end ? 2 : (fpos_off(rde->offset) + 1); - err = note_last_dentry(fi, rde->name, rde->name_len, + err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); if (err) return err; } else if (req->r_reply_info.dir_end) { - fi->next_offset = 2; + dfi->next_offset = 2; /* keep last name */ } } - rinfo = &fi->last_readdir->r_reply_info; + rinfo = &dfi->last_readdir->r_reply_info; dout("readdir frag %x num %d pos %llx chunk first %llx\n", - fi->frag, rinfo->dir_nr, ctx->pos, + dfi->frag, rinfo->dir_nr, ctx->pos, rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); i = 0; @@ -520,52 +520,55 @@ more: ctx->pos++; } - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; - if (fi->next_offset > 2) { - frag = fi->frag; + if (dfi->next_offset > 2) { + frag = dfi->frag; goto more; } /* more frags? 
*/ - if (!ceph_frag_is_rightmost(fi->frag)) { - frag = ceph_frag_next(fi->frag); + if (!ceph_frag_is_rightmost(dfi->frag)) { + frag = ceph_frag_next(dfi->frag); if (is_hash_order(ctx->pos)) { loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), - fi->next_offset, true); + dfi->next_offset, true); if (new_pos > ctx->pos) ctx->pos = new_pos; /* keep last_name */ } else { - ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); - kfree(fi->last_name); - fi->last_name = NULL; + ctx->pos = ceph_make_fpos(frag, dfi->next_offset, + false); + kfree(dfi->last_name); + dfi->last_name = NULL; } dout("readdir next frag is %x\n", frag); goto more; } - fi->flags |= CEPH_F_ATEND; + dfi->file_info.flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { + if (atomic64_read(&ci->i_release_count) == + dfi->dir_release_count) { spin_lock(&ci->i_ceph_lock); - if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { + if (dfi->dir_ordered_count == + atomic64_read(&ci->i_ordered_count)) { dout(" marking %p complete and ordered\n", inode); /* use i_size to track number of entries in * readdir cache */ - BUG_ON(fi->readdir_cache_idx < 0); - i_size_write(inode, fi->readdir_cache_idx * + BUG_ON(dfi->readdir_cache_idx < 0); + i_size_write(inode, dfi->readdir_cache_idx * sizeof(struct dentry*)); } else { dout(" marking %p complete\n", inode); } - __ceph_dir_set_complete(ci, fi->dir_release_count, - fi->dir_ordered_count); + __ceph_dir_set_complete(ci, dfi->dir_release_count, + dfi->dir_ordered_count); spin_unlock(&ci->i_ceph_lock); } @@ -573,25 +576,25 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_dir_file_info *dfi) { - if (fi->last_readdir) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = 
NULL; + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; } - kfree(fi->last_name); - fi->last_name = NULL; - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; - fi->next_offset = 2; /* compensate for . and .. */ - fi->flags &= ~CEPH_F_ATEND; + kfree(dfi->last_name); + dfi->last_name = NULL; + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; + dfi->next_offset = 2; /* compensate for . and .. */ + dfi->file_info.flags &= ~CEPH_F_ATEND; } /* * discard buffered readdir content on seekdir(0), or seek to new frag, * or seek prior to current chunk */ -static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos) { struct ceph_mds_reply_info_parsed *rinfo; loff_t chunk_offset; @@ -600,10 +603,10 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when * dentries are sotred in hash order */ - } else if (fi->frag != fpos_frag(new_pos)) { + } else if (dfi->frag != fpos_frag(new_pos)) { return true; } - rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; + rinfo = dfi->last_readdir ? 
&dfi->last_readdir->r_reply_info : NULL; if (!rinfo || !rinfo->dir_nr) return true; chunk_offset = rinfo->dir_entries[0].offset; @@ -613,7 +616,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file->f_mapping->host; loff_t retval; @@ -631,20 +634,20 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { - if (need_reset_readdir(fi, offset)) { + if (need_reset_readdir(dfi, offset)) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); + reset_readdir(dfi); } else if (is_hash_order(offset) && offset > file->f_pos) { /* for hash offset, we don't know if a forward seek * is within same frag */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; } if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; - fi->flags &= ~CEPH_F_ATEND; + dfi->file_info.flags &= ~CEPH_F_ATEND; } retval = offset; } @@ -1352,7 +1355,7 @@ static void ceph_d_prune(struct dentry *dentry) static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int left; @@ -1361,12 +1364,12 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; - if (!fi->dir_info) { - fi->dir_info = kmalloc(bufsize, GFP_KERNEL); - if (!fi->dir_info) + if (!dfi->dir_info) { + dfi->dir_info = kmalloc(bufsize, GFP_KERNEL); + if (!dfi->dir_info) return -ENOMEM; - fi->dir_info_len = - snprintf(fi->dir_info, bufsize, + 
dfi->dir_info_len = + snprintf(dfi->dir_info, bufsize, "entries: %20lld\n" " files: %20lld\n" " subdirs: %20lld\n" @@ -1386,10 +1389,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, (long)ci->i_rctime.tv_nsec); } - if (*ppos >= fi->dir_info_len) + if (*ppos >= dfi->dir_info_len) return 0; - size = min_t(unsigned, size, fi->dir_info_len-*ppos); - left = copy_to_user(buf, fi->dir_info + *ppos, size); + size = min_t(unsigned, size, dfi->dir_info_len-*ppos); + left = copy_to_user(buf, dfi->dir_info + *ppos, size); if (left == size) return -EFAULT; *ppos += (size - left); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a1f0aee29c27..4a92acba1e9c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -161,13 +161,50 @@ out: return req; } +static int ceph_init_file_info(struct inode *inode, struct file *file, + int fmode, bool isdir) +{ + struct ceph_file_info *fi; + + dout("%s %p %p 0%o (%s)\n", __func__, inode, file, + inode->i_mode, isdir ? "dir" : "regular"); + BUG_ON(inode->i_fop->release != ceph_release); + + if (isdir) { + struct ceph_dir_file_info *dfi = + kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); + if (!dfi) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + + file->private_data = dfi; + fi = &dfi->file_info; + dfi->next_offset = 2; + dfi->readdir_cache_idx = -1; + } else { + fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); + if (!fi) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + + file->private_data = fi; + } + + fi->fmode = fmode; + spin_lock_init(&fi->rw_contexts_lock); + INIT_LIST_HEAD(&fi->rw_contexts); + + return 0; +} + /* * initialize private struct file data. 
* if we fail, clean up by dropping fmode reference on the ceph_inode */ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { - struct ceph_file_info *fi; int ret = 0; switch (inode->i_mode & S_IFMT) { @@ -175,22 +212,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) ceph_fscache_register_inode_cookie(inode); ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: - dout("init_file %p %p 0%o (regular)\n", inode, file, - inode->i_mode); - fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (!fi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ - return -ENOMEM; - } - fi->fmode = fmode; - - spin_lock_init(&fi->rw_contexts_lock); - INIT_LIST_HEAD(&fi->rw_contexts); - - fi->next_offset = 2; - fi->readdir_cache_idx = -1; - file->private_data = fi; - BUG_ON(inode->i_fop->release != ceph_release); + ret = ceph_init_file_info(inode, file, fmode, + S_ISDIR(inode->i_mode)); + if (ret) + return ret; break; case S_IFLNK: @@ -462,16 +487,27 @@ out_acl: int ceph_release(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_file_info *fi = file->private_data; - dout("release inode %p file %p\n", inode, file); - ceph_put_fmode(ci, fi->fmode); - if (fi->last_readdir) - ceph_mdsc_put_request(fi->last_readdir); - kfree(fi->last_name); - kfree(fi->dir_info); - WARN_ON(!list_empty(&fi->rw_contexts)); - kmem_cache_free(ceph_file_cachep, fi); + if (S_ISDIR(inode->i_mode)) { + struct ceph_dir_file_info *dfi = file->private_data; + dout("release inode %p dir file %p\n", inode, file); + WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); + + ceph_put_fmode(ci, dfi->file_info.fmode); + + if (dfi->last_readdir) + ceph_mdsc_put_request(dfi->last_readdir); + kfree(dfi->last_name); + kfree(dfi->dir_info); + kmem_cache_free(ceph_dir_file_cachep, dfi); + } else { + struct ceph_file_info *fi = file->private_data; + dout("release inode %p regular file %p\n", inode, file); + 
WARN_ON(!list_empty(&fi->rw_contexts)); + + ceph_put_fmode(ci, fi->fmode); + kmem_cache_free(ceph_file_cachep, fi); + } /* wake up anyone waiting for caps on this inode */ wake_up_all(&ci->i_cap_wq); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9bf9e54259dd..0fc03c456c50 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -679,6 +679,7 @@ struct kmem_cache *ceph_cap_cachep; struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; +struct kmem_cache *ceph_dir_file_cachep; static void ceph_inode_init_once(void *foo) { @@ -715,6 +716,10 @@ static int __init init_caches(void) if (!ceph_file_cachep) goto bad_file; + ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD); + if (!ceph_dir_file_cachep) + goto bad_dir_file; + error = ceph_fscache_register(); if (error) goto bad_fscache; @@ -722,6 +727,8 @@ static int __init init_caches(void) return 0; bad_fscache: + kmem_cache_destroy(ceph_dir_file_cachep); +bad_dir_file: kmem_cache_destroy(ceph_file_cachep); bad_file: kmem_cache_destroy(ceph_dentry_cachep); @@ -747,6 +754,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); + kmem_cache_destroy(ceph_dir_file_cachep); ceph_fscache_unregister(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1c2086e0fec2..ff49433014e9 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -671,6 +671,10 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; +}; + +struct ceph_dir_file_info { + struct ceph_file_info file_info; /* readdir: position within the dir */ u32 frag; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index c2ec44cf5098..49c93b9308d7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -262,6 +262,7 @@ extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; 
extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; +extern struct kmem_cache *ceph_dir_file_cachep; /* ceph_common.c */ extern bool libceph_compatible(void *data); -- cgit v1.2.3-59-g8ed1b From fb18a57568c2b84cd611e242c0f6fa97b45e4907 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 5 Jan 2018 10:47:18 +0000 Subject: ceph: quota: add initial infrastructure to support cephfs quotas This patch adds the infrastructure required to support cephfs quotas as it is currently implemented in the ceph fuse client. Cephfs quotas can be set on any directory, and can restrict the number of bytes or the number of files stored beneath that point in the directory hierarchy. Quotas are set using the extended attributes 'ceph.quota.max_files' and 'ceph.quota.max_bytes', and can be removed by setting these attributes to '0'. Link: http://tracker.ceph.com/issues/22372 Signed-off-by: Luis Henriques Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- Documentation/filesystems/ceph.txt | 12 +++++++ fs/ceph/Makefile | 2 +- fs/ceph/inode.c | 6 ++++ fs/ceph/mds_client.c | 23 ++++++++++++++ fs/ceph/mds_client.h | 2 ++ fs/ceph/quota.c | 65 ++++++++++++++++++++++++++++++++++++++ fs/ceph/super.h | 8 +++++ fs/ceph/xattr.c | 44 ++++++++++++++++++++++++++ include/linux/ceph/ceph_features.h | 1 + include/linux/ceph/ceph_fs.h | 17 ++++++++++ net/ceph/ceph_common.c | 1 + 11 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 fs/ceph/quota.c (limited to 'include/linux') diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 0b302a11718a..094772481263 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -62,6 +62,18 @@ subdirectories, and a summation of all nested file sizes. This makes the identification of large disk space consumers relatively quick, as no 'du' or similar recursive scan of the file system is required. 
+Finally, Ceph also allows quotas to be set on any directory in the system. +The quota can restrict the number of bytes or the number of files stored +beneath that point in the directory hierarchy. Quotas can be set using +extended attributes 'ceph.quota.max_files' and 'ceph.quota.max_bytes', eg: + + setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir + getfattr -n ceph.quota.max_bytes /some/dir + +A limitation of the current quotas implementation is that it relies on the +cooperation of the client mounting the file system to stop writers when a +limit is reached. A modified or adversarial client cannot be prevented +from writing as much data as it needs. Mount Syntax ============ diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 174f5709e508..a699e320393f 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ - export.o caps.o snap.o xattr.o \ + export.o caps.o snap.o xattr.o quota.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index be5f12d0d637..2c6f8be4ed63 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -441,6 +441,9 @@ struct inode *ceph_alloc_inode(struct super_block *sb) atomic64_set(&ci->i_complete_seq[1], 0); ci->i_symlink = NULL; + ci->i_max_bytes = 0; + ci->i_max_files = 0; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); @@ -790,6 +793,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + ci->i_max_bytes = iinfo->max_bytes; + ci->i_max_files = iinfo->max_files; + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 
537048b4a4d5..1c9877c1149f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -100,6 +100,26 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + if (features & CEPH_FEATURE_MDS_QUOTA) { + u8 struct_v, struct_compat; + u32 struct_len; + + /* + * both struct_v and struct_compat are expected to be >= 1 + */ + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + if (!struct_v || !struct_compat) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + ceph_decode_64_safe(p, end, info->max_bytes, bad); + ceph_decode_64_safe(p, end, info->max_files, bad); + } else { + info->max_bytes = 0; + info->max_files = 0; + } + info->pool_ns_len = 0; info->pool_ns_data = NULL; if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { @@ -4082,6 +4102,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_CLIENT_LEASE: handle_lease(mdsc, s, msg); break; + case CEPH_MSG_CLIENT_QUOTA: + ceph_handle_quota(mdsc, s, msg); + break; default: pr_err("received unknown message type %d %s\n", type, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 71e3b783ee6f..2a67c8b01ae6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -49,6 +49,8 @@ struct ceph_mds_reply_info_in { char *inline_data; u32 pool_ns_len; char *pool_ns_data; + u64 max_bytes; + u64 max_files; }; struct ceph_mds_reply_dir_entry { diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c new file mode 100644 index 000000000000..1b69d8365ec2 --- /dev/null +++ b/fs/ceph/quota.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * quota.c - CephFS quota + * + * Copyright (C) 2017-2018 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) 
any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "super.h" +#include "mds_client.h" + +void ceph_handle_quota(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct ceph_mds_quota *h = msg->front.iov_base; + struct ceph_vino vino; + struct inode *inode; + struct ceph_inode_info *ci; + + if (msg->front.iov_len != sizeof(*h)) { + pr_err("%s corrupt message mds%d len %d\n", __func__, + session->s_mds, (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; + } + + /* increment msg sequence number */ + mutex_lock(&session->s_mutex); + session->s_seq++; + mutex_unlock(&session->s_mutex); + + /* lookup inode */ + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + pr_warn("Failed to find inode %llu\n", vino.ino); + return; + } + ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + ci->i_rbytes = le64_to_cpu(h->rbytes); + ci->i_rfiles = le64_to_cpu(h->rfiles); + ci->i_rsubdirs = le64_to_cpu(h->rsubdirs); + ci->i_max_bytes = le64_to_cpu(h->max_bytes); + ci->i_max_files = le64_to_cpu(h->max_files); + spin_unlock(&ci->i_ceph_lock); + + iput(inode); +} diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ff49433014e9..0c95a929bab7 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -310,6 +310,9 @@ struct ceph_inode_info { u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; + /* quotas */ + u64 i_max_bytes, i_max_files; + struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex; @@ -1070,4 +1073,9 @@ extern int
ceph_locks_to_pagelist(struct ceph_filelock *flocks, extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); +/* quota.c */ +extern void ceph_handle_quota(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); + #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index e1c4e0b12b4c..7e72348639e4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -224,6 +224,31 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, (long)ci->i_rctime.tv_nsec); } +/* quotas */ + +static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) +{ + return (ci->i_max_files || ci->i_max_bytes); +} + +static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "max_bytes=%llu max_files=%llu", + ci->i_max_bytes, ci->i_max_files); +} + +static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%llu", ci->i_max_bytes); +} + +static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%llu", ci->i_max_files); +} #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." 
#_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ @@ -247,6 +272,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, .hidden = true, \ .exists_cb = ceph_vxattrcb_layout_exists, \ } +#define XATTR_QUOTA_FIELD(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = false, \ + .hidden = true, \ + .exists_cb = ceph_vxattrcb_quota_exists, \ + } static struct ceph_vxattr ceph_dir_vxattrs[] = { { @@ -270,6 +304,16 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, rsubdirs), XATTR_NAME_CEPH(dir, rbytes), XATTR_NAME_CEPH(dir, rctime), + { + .name = "ceph.quota", + .name_size = sizeof("ceph.quota"), + .getxattr_cb = ceph_vxattrcb_quota, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_quota_exists, + }, + XATTR_QUOTA_FIELD(quota, max_bytes), + XATTR_QUOTA_FIELD(quota, max_files), { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 59042d5ac520..3901927cf6a0 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -204,6 +204,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ CEPH_FEATURE_OSD_POOLRESEND | \ + CEPH_FEATURE_MDS_QUOTA | \ CEPH_FEATURE_CRUSH_V4 | \ CEPH_FEATURE_NEW_OSDOP_ENCODING | \ CEPH_FEATURE_SERVER_JEWEL | \ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 88dd51381aaf..7ecfc88314d8 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -134,6 +134,7 @@ struct ceph_dir_layout { #define CEPH_MSG_CLIENT_LEASE 0x311 #define CEPH_MSG_CLIENT_SNAP 0x312 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 
+#define CEPH_MSG_CLIENT_QUOTA 0x314 /* pool ops */ #define CEPH_MSG_POOLOP_REPLY 48 @@ -807,4 +808,20 @@ struct ceph_mds_snap_realm { } __attribute__ ((packed)); /* followed by my snap list, then prior parent snap list */ +/* + * quotas + */ +struct ceph_mds_quota { + __le64 ino; /* ino */ + struct ceph_timespec rctime; + __le64 rbytes; /* dir stats */ + __le64 rfiles; + __le64 rsubdirs; + __u8 struct_v; /* compat */ + __u8 struct_compat; + __le32 struct_len; + __le64 max_bytes; /* quota max. bytes */ + __le64 max_files; /* quota max. files */ +} __attribute__ ((packed)); + #endif diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index c15e2699090c..ffbcc7f5e740 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -80,6 +80,7 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_CLIENT_REPLY: return "client_reply"; case CEPH_MSG_CLIENT_CAPS: return "client_caps"; case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; + case CEPH_MSG_CLIENT_QUOTA: return "client_quota"; case CEPH_MSG_CLIENT_SNAP: return "client_snap"; case CEPH_MSG_CLIENT_LEASE: return "client_lease"; case CEPH_MSG_POOLOP_REPLY: return "poolop_reply"; -- cgit v1.2.3-59-g8ed1b