From 182fac2689b769a96e7fc9defcd560c5cca92b1e Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Wed, 29 Feb 2012 08:30:58 -0700 Subject: net/ceph: Only clear SOCK_NOSPACE when there is sufficient space in the socket buffer The Ceph messenger would sometimes queue multiple work items to write data to a socket when the socket buffer was full. Fix this problem by making ceph_write_space() use SOCK_NOSPACE in the same way that net/core/stream.c:sk_stream_write_space() does, i.e., clearing it only when sufficient space is available in the socket buffer. Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- net/ceph/messenger.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ad5b70801f37..d11f91b05452 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -143,16 +143,22 @@ static void ceph_write_space(struct sock *sk) struct ceph_connection *con = (struct ceph_connection *)sk->sk_user_data; - /* only queue to workqueue if there is data we want to write. */ + /* only queue to workqueue if there is data we want to write, + * and there is sufficient space in the socket buffer to accept + * more data. clear SOCK_NOSPACE so that ceph_write_space() + * doesn't get called again until try_write() fills the socket + * buffer. See net/ipv4/tcp_input.c:tcp_check_space() + * and net/core/stream.c:sk_stream_write_space(). + */ if (test_bit(WRITE_PENDING, &con->state)) { - dout("ceph_write_space %p queueing write work\n", con); - queue_con(con); + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + dout("ceph_write_space %p queueing write work\n", con); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_con(con); + } } else { dout("ceph_write_space %p nothing to write\n", con); } - - /* since we have our own write_space, clear the SOCK_NOSPACE flag */ - clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); } /* socket's state has changed */ -- cgit v1.2.3-59-g8ed1b From 64486697771cbe219fffcb5c8e2ed9ca4fdf086c Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Thu, 16 Feb 2012 11:55:48 -0500 Subject: libceph: fix overflow check in crush_decode() The existing overflow check (n > ULONG_MAX / b) didn't work, because n = ULONG_MAX / b would both bypass the check and still overflow the allocation size a + n * b. The correct check should be (n > (ULONG_MAX - a) / b). Signed-off-by: Xi Wang Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fd863fe76934..29ad46ec9dcf 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -283,7 +283,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ceph_decode_32_safe(p, end, yes, bad); #if BITS_PER_LONG == 32 err = -EINVAL; - if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) + if (yes > (ULONG_MAX - sizeof(*r)) + / sizeof(struct crush_rule_step)) goto bad; #endif r = c->rules[i] = kmalloc(sizeof(*r) + -- cgit v1.2.3-59-g8ed1b From 5766651971e81298732466c9aa462ff47898ba37 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: ceph: use a shared zero page rather than one per messenger Each messenger allocates a page to be used when writing zeroes out in the event of error or other abnormal condition. Instead, use the kernel ZERO_PAGE() for that purpose. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 1 - net/ceph/messenger.c | 43 ++++++++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index ffbeb2c217b4..6b5af5f976d1 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -54,7 +54,6 @@ struct ceph_connection_operations { struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ struct ceph_entity_addr my_enc_addr; - struct page *zero_page; /* used in certain error cases */ bool nocrc; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d11f91b05452..738356255e0b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -52,6 +52,9 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; +static struct page *zero_page; /* used in certain error cases */ +static void *zero_page_address; /* kernel virtual addr of zero_page */ + const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; @@ -99,18 +102,41 @@ struct workqueue_struct *ceph_msgr_wq; int ceph_msgr_init(void) { + BUG_ON(zero_page != NULL); + zero_page = ZERO_PAGE(0); + page_cache_get(zero_page); + + BUG_ON(zero_page_address != NULL); + zero_page_address = kmap(zero_page); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (!ceph_msgr_wq) { pr_err("msgr_init failed to create workqueue\n"); + + zero_page_address = NULL; + kunmap(zero_page); + page_cache_release(zero_page); + zero_page = NULL; + return -ENOMEM; } + return 0; } EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { + BUG_ON(ceph_msgr_wq == NULL); destroy_workqueue(ceph_msgr_wq); + + BUG_ON(zero_page_address == NULL); + zero_page_address = NULL; + + BUG_ON(zero_page == NULL); + kunmap(zero_page); + page_cache_release(zero_page); + zero_page = NULL; } EXPORT_SYMBOL(ceph_msgr_exit); @@ -841,9 +867,9 @@ static int write_partial_msg_pages(struct ceph_connection *con) max_write = bv->bv_len; #endif } else { - page = con->msgr->zero_page; + page = zero_page; if (crc) - kaddr = page_address(con->msgr->zero_page); + kaddr = zero_page_address; } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); @@ -914,7 +940,7 @@ static int write_partial_skip(struct ceph_connection *con) while (con->out_skip > 0) { struct kvec iov = { - .iov_base = page_address(con->msgr->zero_page), + .iov_base = zero_page_address, .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) }; @@ -2222,15 +2248,6 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, spin_lock_init(&msgr->global_seq_lock); - /* the zero page is needed if a request is "canceled" while the message - * is being written over the socket */ - msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); - if (!msgr->zero_page) { - kfree(msgr); - return ERR_PTR(-ENOMEM); - } - kmap(msgr->zero_page); - if (myaddr) msgr->inst.addr = *myaddr; @@ -2247,8 +2264,6 @@ EXPORT_SYMBOL(ceph_messenger_create); void ceph_messenger_destroy(struct ceph_messenger *msgr) { dout("destroy %p\n", msgr); - kunmap(msgr->zero_page); - __free_page(msgr->zero_page); kfree(msgr); dout("destroyed messenger %p\n", msgr); } -- cgit v1.2.3-59-g8ed1b From a5bc3129a296fd4663c3ef0be5575e82453739dd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: ceph: make use of "else" where appropriate Rearrange ceph_tcp_connect() a bit, making use of "else" rather than re-testing a value with consecutive "if" statements. Don't record a connection's socket pointer unless the connect operation is successful. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 738356255e0b..b5536e4e39a1 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -251,7 +251,6 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); - con->sock = sock; sock->sk->sk_allocation = GFP_NOFS; #ifdef CONFIG_LOCKDEP @@ -268,18 +267,16 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s EINPROGRESS sk_state = %u\n", ceph_pr_addr(&con->peer_addr.in_addr), sock->sk->sk_state); - ret = 0; - } - if (ret < 0) { + } else if (ret < 0) { pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); - con->sock = NULL; con->error_msg = "connect error"; - } - if (ret < 0) return ERR_PTR(ret); + } + con->sock = sock; + return sock; } -- cgit v1.2.3-59-g8ed1b From f64a93172b97dcfcfa68f595652220653562f605 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: ceph: kill addr_str_lock spinlock; use atomic instead A spinlock is used to protect a value used for selecting an array index for a string used for formatting a socket address for human consumption. The index is reset to 0 if it ever reaches the maximum index value. Instead, use an ever-increasing atomic variable as a sequence number, and compute the array index by masking off all but the sequence number's lowest bits. Make the number of entries in the array a power of two to allow the use of such a mask (to avoid jumps in the index value when the sequence number wraps). The length of these strings is somewhat arbitrarily set at 60 bytes. The worst-case length of a string produced is 54 bytes, for an IPv6 address that can't be shortened, e.g.: [1234:5678:9abc:def0:1111:2222:123.234.210.100]:32767 Change it so we arbitrarily use 64 bytes instead; if nothing else it will make the array of these line up better in hex dumps. Rename a few things to reinforce the distinction between the number of strings in the array and the length of individual strings. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b5536e4e39a1..e86bb3f14859 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -44,13 +44,16 @@ static void con_work(struct work_struct *); static void ceph_fault(struct ceph_connection *con); /* - * nicely render a sockaddr as a string. + * Nicely render a sockaddr as a string. An array of formatted + * strings is used, to approximate reentrancy. */ -#define MAX_ADDR_STR 20 -#define MAX_ADDR_STR_LEN 60 -static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; -static DEFINE_SPINLOCK(addr_str_lock); -static int last_addr_str; +#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ +#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) +#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) +#define MAX_ADDR_STR_LEN 64 /* 54 is enough */ + +static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; +static atomic_t addr_str_seq = ATOMIC_INIT(0); static struct page *zero_page; /* used in certain error cases */ static void *zero_page_address; /* kernel virtual addr of zero_page */ @@ -62,11 +65,7 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; - spin_lock(&addr_str_lock); - i = last_addr_str++; - if (last_addr_str == MAX_ADDR_STR) - last_addr_str = 0; - spin_unlock(&addr_str_lock); + i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; switch (ss->ss_family) { -- cgit v1.2.3-59-g8ed1b From bd406145129e8724cc71b65ff2a788dbd4d60c50 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: ceph: eliminate some needless casts This eliminates type casts in some places where they are not required. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e86bb3f14859..09a412ba4b70 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -70,13 +70,13 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, + ntohs(in4->sin_port)); break; case AF_INET6: - snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr, + ntohs(in6->sin6_port)); break; default: @@ -153,8 +153,8 @@ EXPORT_SYMBOL(ceph_msgr_flush); /* data available on socket, or listen socket received a connect */ static void ceph_data_ready(struct sock *sk, int count_unused) { - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; + struct ceph_connection *con = sk->sk_user_data; + if (sk->sk_state != TCP_CLOSE_WAIT) { dout("ceph_data_ready on %p state = %lu, queueing work\n", con, con->state); @@ -189,8 +189,7 @@ static void ceph_write_space(struct sock *sk) /* socket's state has changed */ static void ceph_state_change(struct sock *sk) { - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; + struct ceph_connection *con = sk->sk_user_data; dout("ceph_state_change %p state = %lu sk_state = %u\n", con, con->state, sk->sk_state); @@ -225,7 +224,7 @@ static void set_sock_callbacks(struct socket *sock, struct ceph_connection *con) { struct sock *sk = sock->sk; - sk->sk_user_data = (void *)con; + sk->sk_user_data = con; sk->sk_data_ready = ceph_data_ready; sk->sk_write_space = ceph_write_space; sk->sk_state_change = ceph_state_change; @@ -552,7 +551,7 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in crc (except data pages), footer */ con->out_msg->hdr.crc = - cpu_to_le32(crc32c(0, (void *)&m->hdr, + cpu_to_le32(crc32c(0, &m->hdr, sizeof(m->hdr) - sizeof(m->hdr.crc))); con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; con->out_msg->footer.front_crc = @@ -1647,7 +1646,7 @@ static int read_partial_message(struct ceph_connection *con) return ret; con->in_base_pos += ret; if (con->in_base_pos == sizeof(con->in_hdr)) { - u32 crc = crc32c(0, (void *)&con->in_hdr, + u32 crc = crc32c(0, &con->in_hdr, sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); if (crc != le32_to_cpu(con->in_hdr.crc)) { pr_err("read_partial_message bad hdr " -- cgit v1.2.3-59-g8ed1b From 99f0f3b2c4be15784bb4ede33b5f2c3f7861dba7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 23 Jan 2012 15:49:27 -0600 Subject: ceph: eliminate some abusive casts This fixes some spots where a type cast to (void *) was used as as a universal type hiding mechanism. Instead, properly cast the type to the intended target type. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 09a412ba4b70..3917847ad8e5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -62,8 +62,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; char *s; - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; + struct sockaddr_in *in4 = (struct sockaddr_in *) ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; @@ -1112,8 +1112,8 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, char delim, const char **ipend) { - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; + struct sockaddr_in *in4 = (struct sockaddr_in *) ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; memset(ss, 0, sizeof(*ss)); -- cgit v1.2.3-59-g8ed1b From ee57741c5209154b8ef124bcaa2496da1b69a988 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 24 Jan 2012 10:08:36 -0600 Subject: rbd: make ceph_parse_options() return a pointer ceph_parse_options() takes the address of a pointer as an argument and uses it to return the address of an allocated structure if successful. With this interface is not evident at call sites that the pointer is always initialized. Change the interface to return the address instead (or a pointer-coded error code) to make the validity of the returned pointer obvious. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 6 ++++-- fs/ceph/super.c | 6 ++++-- include/linux/ceph/libceph.h | 2 +- net/ceph/ceph_common.c | 16 ++++++++-------- 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b9371f0b9532..ed6711e35323 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -371,11 +371,13 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; - ret = ceph_parse_options(&opt, options, mon_addr, + opt = ceph_parse_options(options, mon_addr, mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); - if (ret < 0) + if (IS_ERR(opt)) { + ret = PTR_ERR(opt); goto done_err; + } spin_lock(&node_lock); rbdc = __rbd_client_find(opt); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3da3b32bdde..4fab1fdcfa6a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -334,10 +334,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, *path += 2; dout("server path '%s'\n", *path); - err = ceph_parse_options(popt, options, dev_name, dev_name_end, + *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); - if (err) + if (IS_ERR(*popt)) { + err = PTR_ERR(*popt); goto out; + } /* success */ *pfsopt = fsopt; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 95bd8502e715..92eef7c3d3c5 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -207,7 +207,7 @@ extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; -extern int ceph_parse_options(struct ceph_options **popt, char *options, +extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, int (*parse_extra_token)(char *c, void *private), void *private); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 761ad9d6cc3b..621c3221b393 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -277,10 +277,11 @@ out: return err; } -int ceph_parse_options(struct ceph_options **popt, char *options, - const char *dev_name, const char *dev_name_end, - int (*parse_extra_token)(char *c, void *private), - void *private) +struct ceph_options * +ceph_parse_options(char *options, const char *dev_name, + const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private) { struct ceph_options *opt; const char *c; @@ -289,7 +290,7 @@ int ceph_parse_options(struct ceph_options **popt, char *options, opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) - return err; + return ERR_PTR(-ENOMEM); opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), GFP_KERNEL); if (!opt->mon_addr) @@ -412,12 +413,11 @@ int ceph_parse_options(struct ceph_options **popt, char *options, } /* success */ - *popt = opt; - return 0; + return opt; out: ceph_destroy_options(opt); - return err; + return ERR_PTR(err); } EXPORT_SYMBOL(ceph_parse_options); -- cgit v1.2.3-59-g8ed1b From 963be4d7709f84d865f76d12d5b0ec7edad1c498 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: libceph: move prepare_write_banner() One of the arguments to prepare_write_connect() indicates whether it is being called immediately after a call to prepare_write_banner(). Move the prepare_write_banner() call inside prepare_write_connect(), and reinterpret (and rename) the "after_banner" argument so it indicates that prepare_write_connect() should *make* the call rather than should know it has already been made. This was split out from the next patch to highlight this change in logic. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3917847ad8e5..0665945b6468 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -676,7 +676,7 @@ static void prepare_write_banner(struct ceph_messenger *msgr, static int prepare_write_connect(struct ceph_messenger *msgr, struct ceph_connection *con, - int after_banner) + int include_banner) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -705,7 +705,9 @@ static int prepare_write_connect(struct ceph_messenger *msgr, con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - if (!after_banner) { + if (include_banner) + prepare_write_banner(msgr, con); + else { con->out_kvec_left = 0; con->out_kvec_bytes = 0; } @@ -1846,7 +1848,6 @@ more: /* open the socket first? */ if (con->sock == NULL) { - prepare_write_banner(msgr, con); prepare_write_connect(msgr, con, 1); prepare_read_banner(con); set_bit(CONNECTING, &con->state); -- cgit v1.2.3-59-g8ed1b From 859eb7994876f26fd9f52d9589fbcab8e2fe8069 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: libceph: encapsulate connection kvec operations Encapsulate the operation of adding a new chunk of data to the next open slot in a ceph_connection's out_kvec array. Also add a "reset" operation to make subsequent add operations start at the beginning of the array again. Use these routines throughout, avoiding duplicate code and ensuring all calls are handled consistently. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 117 ++++++++++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0665945b6468..04d2b975ab0c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -469,14 +469,35 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } +static void ceph_con_out_kvec_reset(struct ceph_connection *con) +{ + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + con->out_kvec_cur = &con->out_kvec[0]; +} + +static void ceph_con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index; + + index = con->out_kvec_left; + BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + + con->out_kvec[index].iov_len = size; + con->out_kvec[index].iov_base = data; + con->out_kvec_left++; + con->out_kvec_bytes += size; +} /* * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. */ -static void prepare_write_message_footer(struct ceph_connection *con, int v) +static void prepare_write_message_footer(struct ceph_connection *con) { struct ceph_msg *m = con->out_msg; + int v = con->out_kvec_left; dout("prepare_write_message_footer %p\n", con); con->out_kvec_is_msg = true; @@ -494,9 +515,8 @@ static void prepare_write_message_footer(struct ceph_connection *con, int v) static void prepare_write_message(struct ceph_connection *con) { struct ceph_msg *m; - int v = 0; - con->out_kvec_bytes = 0; + ceph_con_out_kvec_reset(con); con->out_kvec_is_msg = true; con->out_msg_done = false; @@ -504,16 +524,13 @@ static void prepare_write_message(struct ceph_connection *con) * TCP packet that's a good thing. */ if (con->in_seq > con->in_seq_acked) { con->in_seq_acked = con->in_seq; - con->out_kvec[v].iov_base = &tag_ack; - con->out_kvec[v++].iov_len = 1; + ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[v].iov_base = &con->out_temp_ack; - con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); } - m = list_first_entry(&con->out_queue, - struct ceph_msg, list_head); + m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); con->out_msg = m; /* put message on sent list */ @@ -537,17 +554,13 @@ static void prepare_write_message(struct ceph_connection *con) BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ - con->out_kvec[v].iov_base = &tag_msg; - con->out_kvec[v++].iov_len = 1; - con->out_kvec[v].iov_base = &m->hdr; - con->out_kvec[v++].iov_len = sizeof(m->hdr); - con->out_kvec[v++] = m->front; + ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + if (m->middle) - con->out_kvec[v++] = m->middle->vec; - con->out_kvec_left = v; - con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + - (m->middle ? m->middle->vec.iov_len : 0); - con->out_kvec_cur = con->out_kvec; + ceph_con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); /* fill in crc (except data pages), footer */ con->out_msg->hdr.crc = @@ -580,7 +593,7 @@ static void prepare_write_message(struct ceph_connection *con) con->out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ - prepare_write_message_footer(con, v); + prepare_write_message_footer(con); } set_bit(WRITE_PENDING, &con->state); @@ -595,14 +608,14 @@ static void prepare_write_ack(struct ceph_connection *con) con->in_seq_acked, con->in_seq); con->in_seq_acked = con->in_seq; - con->out_kvec[0].iov_base = &tag_ack; - con->out_kvec[0].iov_len = 1; + ceph_con_out_kvec_reset(con); + + ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[1].iov_base = &con->out_temp_ack; - con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_left = 2; - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); - con->out_kvec_cur = con->out_kvec; + ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + con->out_more = 1; /* more will follow.. eventually.. */ set_bit(WRITE_PENDING, &con->state); } @@ -613,11 +626,8 @@ static void prepare_write_ack(struct ceph_connection *con) static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); - con->out_kvec[0].iov_base = &tag_keepalive; - con->out_kvec[0].iov_len = 1; - con->out_kvec_left = 1; - con->out_kvec_bytes = 1; - con->out_kvec_cur = con->out_kvec; + ceph_con_out_kvec_reset(con); + ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); set_bit(WRITE_PENDING, &con->state); } @@ -646,12 +656,9 @@ static int prepare_connect_authorizer(struct ceph_connection *con) con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); - if (auth_len) { - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; - } + if (auth_len) + ceph_con_out_kvec_add(con, auth_len, auth_buf); + return 0; } @@ -661,15 +668,11 @@ static int prepare_connect_authorizer(struct ceph_connection *con) static void prepare_write_banner(struct ceph_messenger *msgr, struct ceph_connection *con) { - int len = strlen(CEPH_BANNER); + ceph_con_out_kvec_reset(con); + ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr), + &msgr->my_enc_addr); - con->out_kvec[0].iov_base = CEPH_BANNER; - con->out_kvec[0].iov_len = len; - con->out_kvec[1].iov_base = &msgr->my_enc_addr; - con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); - con->out_kvec_left = 2; - con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); - con->out_kvec_cur = con->out_kvec; con->out_more = 0; set_bit(WRITE_PENDING, &con->state); } @@ -707,22 +710,16 @@ static int prepare_write_connect(struct ceph_messenger *msgr, if (include_banner) prepare_write_banner(msgr, con); - else { - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - } - con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; - con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); - con->out_kvec_left++; - con->out_kvec_bytes += sizeof(con->out_connect); - con->out_kvec_cur = con->out_kvec; + else + ceph_con_out_kvec_reset(con); + ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); + con->out_more = 0; set_bit(WRITE_PENDING, &con->state); return prepare_connect_authorizer(con); } - /* * write as much of pending kvecs to the socket as we can. * 1 -> done @@ -919,10 +916,8 @@ static int write_partial_msg_pages(struct ceph_connection *con) /* prepare and queue up footer, too */ if (!crc) con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con->out_kvec_bytes = 0; - con->out_kvec_left = 0; - con->out_kvec_cur = con->out_kvec; - prepare_write_message_footer(con, 0); + ceph_con_out_kvec_reset(con); + prepare_write_message_footer(con); ret = 1; out: return ret; -- cgit v1.2.3-59-g8ed1b From e0f43c9419c1900e5b50de4261e9686a45a0a2b8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: libceph: make ceph_msgr_wq private The messenger workqueue has no need to be public. So give it static scope. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 2 -- net/ceph/messenger.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6b5af5f976d1..5ca0f8244203 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -14,8 +14,6 @@ struct ceph_msg; struct ceph_connection; -extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */ - /* * Ceph defines these callbacks for handling connection events. */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 04d2b975ab0c..31f59ac03d8a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -97,7 +97,7 @@ static void encode_my_addr(struct ceph_messenger *msgr) /* * work queue for all reading and writing to/from the socket. */ -struct workqueue_struct *ceph_msgr_wq; +static struct workqueue_struct *ceph_msgr_wq; int ceph_msgr_init(void) { -- cgit v1.2.3-59-g8ed1b From 6173d1f02fb19c0fba02857ae4e1109b5ec95034 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: libceph: encapsulate some messenger cleanup code Define a helper function to perform various cleanup operations. Use it both in the exit routine and in the init routine in the event of an error. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 31f59ac03d8a..c3023a600ad2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -99,6 +99,20 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ static struct workqueue_struct *ceph_msgr_wq; +void _ceph_msgr_exit(void) +{ + if (ceph_msgr_wq) + destroy_workqueue(ceph_msgr_wq); + + BUG_ON(zero_page_address == NULL); + zero_page_address = NULL; + + BUG_ON(zero_page == NULL); + kunmap(zero_page); + page_cache_release(zero_page); + zero_page = NULL; +} + int ceph_msgr_init(void) { BUG_ON(zero_page != NULL); @@ -109,33 +123,21 @@ int ceph_msgr_init(void) zero_page_address = kmap(zero_page); ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); - if (!ceph_msgr_wq) { - pr_err("msgr_init failed to create workqueue\n"); - - zero_page_address = NULL; - kunmap(zero_page); - page_cache_release(zero_page); - zero_page = NULL; + if (ceph_msgr_wq) + return 0; - return -ENOMEM; - } + pr_err("msgr_init failed to create workqueue\n"); + _ceph_msgr_exit(); - return 0; + return -ENOMEM; } EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { BUG_ON(ceph_msgr_wq == NULL); - destroy_workqueue(ceph_msgr_wq); - BUG_ON(zero_page_address == NULL); - zero_page_address = NULL; - - BUG_ON(zero_page == NULL); - kunmap(zero_page); - page_cache_release(zero_page); - zero_page = NULL; + _ceph_msgr_exit(); } EXPORT_SYMBOL(ceph_msgr_exit); -- cgit v1.2.3-59-g8ed1b From 41617d0c9c9832e030667277ddf6b4ffb4ecdc90 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: libceph: make ceph_tcp_connect() return int There is no real need for ceph_tcp_connect() to return the socket pointer it creates, since it already assigns it to con->sock, which is visible to the caller. Instead, have it return an error code, which tidies things up a bit. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c3023a600ad2..e1e53bb2d0cf 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -240,7 +240,7 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static struct socket *ceph_tcp_connect(struct ceph_connection *con) +static int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; @@ -250,7 +250,7 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) - return ERR_PTR(ret); + return ret; sock->sk->sk_allocation = GFP_NOFS; #ifdef CONFIG_LOCKDEP @@ -273,11 +273,11 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) sock_release(sock); con->error_msg = "connect error"; - return ERR_PTR(ret); + return ret; } con->sock = sock; - return sock; + return 0; } static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) @@ -1854,11 +1854,9 @@ more: con->in_tag = CEPH_MSGR_TAG_READY; dout("try_write initiating connect on %p new state %lu\n", con, con->state); - con->sock = ceph_tcp_connect(con); - if (IS_ERR(con->sock)) { - con->sock = NULL; + ret = ceph_tcp_connect(con); + if (ret < 0) { con->error_msg = "connect error"; - ret = -1; goto out; } } -- cgit v1.2.3-59-g8ed1b From d3002b974cefbb7c1e325cc296966f768ff76b06 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 14 Feb 2012 14:05:33 -0600 Subject: libceph: a few small changes This gathers a number of very minor changes: - use %hu when formatting the a socket address's address family - null out the ceph_msgr_wq pointer after the queue has been destroyed - drop a needless cast in ceph_write_space() - add a WARN() call in ceph_state_change() in the event an unrecognized socket state is encountered - rearrange the logic in ceph_con_get() and ceph_con_put() so that: - the reference counts are only atomically read once - the values displayed via dout() calls are known to be meaningful at the time they are formatted Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e1e53bb2d0cf..44d8c77cabdd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -80,8 +80,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) break; default: - snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", - (int)ss->ss_family); + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", + ss->ss_family); } return s; @@ -101,8 +101,10 @@ static struct workqueue_struct *ceph_msgr_wq; void _ceph_msgr_exit(void) { - if (ceph_msgr_wq) + if (ceph_msgr_wq) { destroy_workqueue(ceph_msgr_wq); + ceph_msgr_wq = NULL; + } BUG_ON(zero_page_address == NULL); zero_page_address = NULL; @@ -167,8 +169,7 @@ static void ceph_data_ready(struct sock *sk, int count_unused) /* socket has buffer space for writing */ static void ceph_write_space(struct sock *sk) { - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; + struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept @@ -216,6 +217,8 @@ static void ceph_state_change(struct sock *sk) dout("ceph_state_change TCP_ESTABLISHED\n"); queue_con(con); break; + default: /* Everything else is uninteresting */ + break; } } @@ -420,22 +423,23 @@ bool ceph_con_opened(struct ceph_connection *con) */ struct ceph_connection *ceph_con_get(struct ceph_connection *con) { - dout("con_get %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) + 1); - if (atomic_inc_not_zero(&con->nref)) - return con; - return NULL; + int nref = __atomic_add_unless(&con->nref, 1, 0); + + dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1); + + return nref ? con : NULL; } void ceph_con_put(struct ceph_connection *con) { - dout("con_put %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) - 1); - BUG_ON(atomic_read(&con->nref) == 0); - if (atomic_dec_and_test(&con->nref)) { + int nref = atomic_dec_return(&con->nref); + + BUG_ON(nref < 0); + if (nref == 0) { BUG_ON(con->sock); kfree(con); } + dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref); } /* -- cgit v1.2.3-59-g8ed1b From cffaba15cd95d4a16eb5a6aa5c22a79f67d555ab Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: ceph: ensure Boolean options support both senses Many ceph-related Boolean options offer the ability to both enable and disable a feature. For all those that don't offer this, add a new option so that they do. Note that ceph_show_options()--which reports mount options currently in effect--only reports the option if it is different from the default value. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- fs/ceph/super.c | 10 ++++++++++ net/ceph/ceph_common.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'net') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4fab1fdcfa6a..3663cf0cb073 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -130,10 +130,12 @@ enum { Opt_nodirstat, Opt_rbytes, Opt_norbytes, + Opt_asyncreaddir, Opt_noasyncreaddir, Opt_dcache, Opt_nodcache, Opt_ino32, + Opt_noino32, }; static match_table_t fsopt_tokens = { @@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = { {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, + {Opt_asyncreaddir, "asyncreaddir"}, {Opt_noasyncreaddir, "noasyncreaddir"}, {Opt_dcache, "dcache"}, {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, + {Opt_noino32, "noino32"}, {-1, NULL} }; @@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_norbytes: fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; + case Opt_asyncreaddir: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; @@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; + case Opt_noino32: + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; default: BUG_ON(token); } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 621c3221b393..cc913193d992 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -201,7 +201,9 @@ enum { Opt_ip, Opt_last_string, /* string args above */ + Opt_share, Opt_noshare, + Opt_crc, Opt_nocrc, }; @@ -217,7 +219,9 @@ static match_table_t opt_tokens = { {Opt_key, "key=%s"}, {Opt_ip, "ip=%s"}, /* string args above */ + {Opt_share, "share"}, {Opt_noshare, "noshare"}, + {Opt_crc, "crc"}, {Opt_nocrc, "nocrc"}, {-1, NULL} }; @@ -399,10 +403,16 @@ ceph_parse_options(char *options, const char *dev_name, opt->mount_timeout = intval; break; + case Opt_share: + opt->flags &= ~CEPH_OPT_NOSHARE; + break; case Opt_noshare: opt->flags |= CEPH_OPT_NOSHARE; break; + case Opt_crc: + opt->flags &= ~CEPH_OPT_NOCRC; + break; case Opt_nocrc: opt->flags |= CEPH_OPT_NOCRC; break; -- cgit v1.2.3-59-g8ed1b From bca064d236a2e3162a07c758855221bcbe3c475b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: libceph: use "do" in CRC-related Boolean variables Change the name (and type) of a few CRC-related Boolean local variables so they contain the word "do", to distingish their purpose from variables used for holding an actual CRC value. Note that in the process of doing this I identified a fairly serious logic error in write_partial_msg_pages(): the value of "do_crc" assigned appears to be the opposite of what it should be. No attempt to fix this is made here; this change preserves the erroneous behavior. The problem I found is documented here: http://tracker.newdream.net/issues/2064 Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 40 ++++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 5ca0f8244203..3bff047f6b0f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -98,7 +98,7 @@ struct ceph_msg { struct ceph_msg_pos { int page, page_pos; /* which page; offset in page */ int data_pos; /* offset in data payload */ - int did_page_crc; /* true if we've calculated crc for current page */ + bool did_page_crc; /* true if we've calculated crc for current page */ }; /* ceph connection fault delay defaults, for exponential backoff */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 44d8c77cabdd..204e229e6628 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -595,7 +595,7 @@ static void prepare_write_message(struct ceph_connection *con) else con->out_msg_pos.page_pos = 0; con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = 0; + con->out_msg_pos.did_page_crc = false; con->out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ @@ -805,7 +805,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct ceph_msg *msg = con->out_msg; unsigned data_len = le32_to_cpu(msg->hdr.data_len); size_t len; - int crc = con->msgr->nocrc; + bool do_crc = con->msgr->nocrc; int ret; int total_max_write; int in_trail = 0; @@ -843,17 +843,17 @@ static int write_partial_msg_pages(struct ceph_connection *con) page = list_first_entry(&msg->trail->head, struct page, lru); - if (crc) + if (do_crc) kaddr = kmap(page); max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; - if (crc) + if (do_crc) kaddr = kmap(page); } else if (msg->pagelist) { page = list_first_entry(&msg->pagelist->head, struct page, lru); - if (crc) + if (do_crc) kaddr = kmap(page); #ifdef CONFIG_BLOCK } else if (msg->bio) { @@ -862,26 +862,26 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; page_shift = bv->bv_offset; - if (crc) + if (do_crc) kaddr = kmap(page) + page_shift; max_write = bv->bv_len; #endif } else { page = zero_page; - if (crc) + if (do_crc) kaddr = zero_page_address; } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); - if (crc && !con->out_msg_pos.did_page_crc) { + if (do_crc && !con->out_msg_pos.did_page_crc) { void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); BUG_ON(kaddr == NULL); con->out_msg->footer.data_crc = cpu_to_le32(crc32c(tmpcrc, base, len)); - con->out_msg_pos.did_page_crc = 1; + con->out_msg_pos.did_page_crc = true; } ret = kernel_sendpage(con->sock, page, con->out_msg_pos.page_pos + page_shift, @@ -889,7 +889,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (crc && + if (do_crc && (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); @@ -903,7 +903,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (ret == len) { con->out_msg_pos.page_pos = 0; con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = 0; + con->out_msg_pos.did_page_crc = false; if (in_trail) list_move_tail(&page->lru, &msg->trail->head); @@ -920,7 +920,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ - if (!crc) + if (!do_crc) con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; ceph_con_out_kvec_reset(con); prepare_write_message_footer(con); @@ -1557,7 +1557,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, static int read_partial_message_pages(struct ceph_connection *con, struct page **pages, - unsigned data_len, int datacrc) + unsigned data_len, bool do_datacrc) { void *p; int ret; @@ -1570,7 +1570,7 @@ static int read_partial_message_pages(struct ceph_connection *con, p = kmap(pages[con->in_msg_pos.page]); ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); - if (ret > 0 && datacrc) + if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, p + con->in_msg_pos.page_pos, ret); @@ -1590,7 +1590,7 @@ static int read_partial_message_pages(struct ceph_connection *con, #ifdef CONFIG_BLOCK static int read_partial_message_bio(struct ceph_connection *con, struct bio **bio_iter, int *bio_seg, - unsigned data_len, int datacrc) + unsigned data_len, bool do_datacrc) { struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); void *p; @@ -1606,7 +1606,7 @@ static int read_partial_message_bio(struct ceph_connection *con, ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); - if (ret > 0 && datacrc) + if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, p + con->in_msg_pos.page_pos, ret); @@ -1633,7 +1633,7 @@ static int read_partial_message(struct ceph_connection *con) int ret; int to, left; unsigned front_len, middle_len, data_len; - int datacrc = con->msgr->nocrc; + bool do_datacrc = con->msgr->nocrc; int skip; u64 seq; @@ -1744,7 +1744,7 @@ static int read_partial_message(struct ceph_connection *con) while (con->in_msg_pos.data_pos < data_len) { if (m->pages) { ret = read_partial_message_pages(con, m->pages, - data_len, datacrc); + data_len, do_datacrc); if (ret <= 0) return ret; #ifdef CONFIG_BLOCK @@ -1752,7 +1752,7 @@ static int read_partial_message(struct ceph_connection *con) ret = read_partial_message_bio(con, &m->bio_iter, &m->bio_seg, - data_len, datacrc); + data_len, do_datacrc); if (ret <= 0) return ret; #endif @@ -1787,7 +1787,7 @@ static int read_partial_message(struct ceph_connection *con) m, con->in_middle_crc, m->footer.middle_crc); return -EBADMSG; } - if (datacrc && + if (do_datacrc && (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { pr_err("read_partial_message %p data crc %u != exp. %u\n", m, -- cgit v1.2.3-59-g8ed1b From a9a0c51af4e7c825c014b40694571456a75ebbc4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: libceph: separate CRC calculation from byte swapping Calculate CRC in a separate step from rearranging the byte order of the result, to improve clarity and readability. Use offsetof() to determine the number of bytes to include in the CRC calculation. In read_partial_message(), switch which value gets byte-swapped, since the just-computed CRC is already likely to be in a register. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 204e229e6628..7ec6a228b667 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -521,6 +521,7 @@ static void prepare_write_message_footer(struct ceph_connection *con) static void prepare_write_message(struct ceph_connection *con) { struct ceph_msg *m; + u32 crc; ceph_con_out_kvec_reset(con); con->out_kvec_is_msg = true; @@ -569,17 +570,17 @@ static void prepare_write_message(struct ceph_connection *con) m->middle->vec.iov_base); /* fill in crc (except data pages), footer */ - con->out_msg->hdr.crc = - cpu_to_le32(crc32c(0, &m->hdr, - sizeof(m->hdr) - sizeof(m->hdr.crc))); + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; - con->out_msg->footer.front_crc = - cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); - if (m->middle) - con->out_msg->footer.middle_crc = - cpu_to_le32(crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len)); - else + + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else con->out_msg->footer.middle_crc = 0; con->out_msg->footer.data_crc = 0; dout("prepare_write_message front_crc %u data_crc %u\n", @@ -875,12 +876,13 @@ static int write_partial_msg_pages(struct ceph_connection *con) total_max_write); if (do_crc && !con->out_msg_pos.did_page_crc) { + u32 crc; void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); BUG_ON(kaddr == NULL); - con->out_msg->footer.data_crc = - cpu_to_le32(crc32c(tmpcrc, base, len)); + crc = crc32c(tmpcrc, base, len); + con->out_msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = kernel_sendpage(con->sock, page, @@ -1650,8 +1652,9 @@ static int read_partial_message(struct ceph_connection *con) con->in_base_pos += ret; if (con->in_base_pos == sizeof(con->in_hdr)) { u32 crc = crc32c(0, &con->in_hdr, - sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); - if (crc != le32_to_cpu(con->in_hdr.crc)) { + offsetof(struct ceph_msg_header, crc)); + + if (cpu_to_le32(crc) != con->in_hdr.crc) { pr_err("read_partial_message bad hdr " " crc %u != expected %u\n", crc, con->in_hdr.crc); -- cgit v1.2.3-59-g8ed1b From fe3ad593e2c34457ffa6233014ab19f4d36f85f2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: libceph: do crc calculations outside loop Move blocks of code out of loops in read_partial_message_section() and read_partial_message(). They were only was getting called at the end of the last iteration of the loop anyway. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 7ec6a228b667..575511a29eb7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1544,10 +1544,9 @@ static int read_partial_message_section(struct ceph_connection *con, if (ret <= 0) return ret; section->iov_len += ret; - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, - section->iov_len); } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); return 1; } @@ -1638,6 +1637,7 @@ static int read_partial_message(struct ceph_connection *con) bool do_datacrc = con->msgr->nocrc; int skip; u64 seq; + u32 crc; dout("read_partial_message con %p msg %p\n", con, m); @@ -1650,18 +1650,16 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; con->in_base_pos += ret; - if (con->in_base_pos == sizeof(con->in_hdr)) { - u32 crc = crc32c(0, &con->in_hdr, - offsetof(struct ceph_msg_header, crc)); - - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr " - " crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - } } + + crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->in_hdr.crc) { + pr_err("read_partial_message bad hdr " + " crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + front_len = le32_to_cpu(con->in_hdr.front_len); if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; -- cgit v1.2.3-59-g8ed1b From f42299e6c3883c69c14079b8c88fe33815b2dcc3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:54 -0600 Subject: libceph: small refactor in write_partial_kvec() Make a small change in the code that counts down kvecs consumed by a ceph_tcp_sendmsg() call. Same functionality, just blocked out a little differently. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 575511a29eb7..e8f236e87f38 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -747,17 +747,18 @@ static int write_partial_kvec(struct ceph_connection *con) con->out_kvec_bytes -= ret; if (con->out_kvec_bytes == 0) break; /* done */ - while (ret > 0) { - if (ret >= con->out_kvec_cur->iov_len) { - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } else { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - ret = 0; - break; - } + + /* account for full iov entries consumed */ + while (ret >= con->out_kvec_cur->iov_len) { + BUG_ON(!con->out_kvec_left); + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; } } con->out_kvec_left = 0; -- cgit v1.2.3-59-g8ed1b From 84495f496170a73ed79667b7fbf91947b7f47c87 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Feb 2012 07:43:55 -0600 Subject: libceph: some simple changes Nothing too big here. - define the size of the buffer used for consuming ignored incoming data using a symbolic constant - simplify the condition determining whether to unmap the page in write_partial_msg_pages(): do it for crc but not if the page is the zero page Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/messenger.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e8f236e87f38..1a22975945da 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -38,6 +38,11 @@ static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; static struct lock_class_key socket_class; #endif +/* + * When skipping (ignoring) a block of input we read it into a "skip + * buffer," which is this many bytes in size. + */ +#define SKIP_BUF_SIZE 1024 static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); @@ -892,8 +897,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (do_crc && - (msg->pages || msg->pagelist || msg->bio || in_trail)) + if (do_crc && kaddr != zero_page_address) kunmap(page); if (ret == -EAGAIN) @@ -1982,8 +1986,9 @@ more: * * FIXME: there must be a better way to do this! */ - static char buf[1024]; - int skip = min(1024, -con->in_base_pos); + static char buf[SKIP_BUF_SIZE]; + int skip = min((int) sizeof (buf), -con->in_base_pos); + dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); ret = ceph_tcp_recvmsg(con->sock, buf, skip); if (ret <= 0) -- cgit v1.2.3-59-g8ed1b From 37675b0f42a8f7699c3602350d1c3b2a1698a3d3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: libceph: fix inverted crc option logic CRC's are computed for all messages between ceph entities. The CRC computation for the data portion of message can optionally be disabled using the "nocrc" (common) ceph option. The default is for CRC computation for the data portion to be enabled. Unfortunately, the code that implements this feature interprets the feature flag wrong, meaning that by default the CRC's have *not* been computed (or checked) for the data portion of messages unless the "nocrc" option was supplied. Fix this, in write_partial_msg_pages() and read_partial_message(). Also change the flag variable in write_partial_msg_pages() to be "no_datacrc" to match the usage elsewhere in the file. This fixes http://tracker.newdream.net/issues/2064 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 1a22975945da..589b7689d31b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -812,7 +812,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct ceph_msg *msg = con->out_msg; unsigned data_len = le32_to_cpu(msg->hdr.data_len); size_t len; - bool do_crc = con->msgr->nocrc; + bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; int in_trail = 0; @@ -850,17 +850,17 @@ static int write_partial_msg_pages(struct ceph_connection *con) page = list_first_entry(&msg->trail->head, struct page, lru); - if (do_crc) + if (do_datacrc) kaddr = kmap(page); max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; - if (do_crc) + if (do_datacrc) kaddr = kmap(page); } else if (msg->pagelist) { page = list_first_entry(&msg->pagelist->head, struct page, lru); - if (do_crc) + if (do_datacrc) kaddr = kmap(page); #ifdef CONFIG_BLOCK } else if (msg->bio) { @@ -869,19 +869,19 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; page_shift = bv->bv_offset; - if (do_crc) + if (do_datacrc) kaddr = kmap(page) + page_shift; max_write = bv->bv_len; #endif } else { page = zero_page; - if (do_crc) + if (do_datacrc) kaddr = zero_page_address; } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); - if (do_crc && !con->out_msg_pos.did_page_crc) { + if (do_datacrc && !con->out_msg_pos.did_page_crc) { u32 crc; void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); @@ -897,7 +897,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (do_crc && kaddr != zero_page_address) + if (do_datacrc && kaddr != zero_page_address) kunmap(page); if (ret == -EAGAIN) @@ -927,7 +927,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ - if (!do_crc) + if (!do_datacrc) con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; ceph_con_out_kvec_reset(con); prepare_write_message_footer(con); @@ -1639,7 +1639,7 @@ static int read_partial_message(struct ceph_connection *con) int ret; int to, left; unsigned front_len, middle_len, data_len; - bool do_datacrc = con->msgr->nocrc; + bool do_datacrc = !con->msgr->nocrc; int skip; u64 seq; u32 crc; -- cgit v1.2.3-59-g8ed1b From 31739139f3ed7be802dd9019ec8d8cc910e3d241 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: libceph: use kernel_sendpage() for sending zeroes If a message queued for send gets revoked, zeroes are sent over the wire instead of any unsent data. This is done by constructing a message and passing it to kernel_sendmsg() via ceph_tcp_sendmsg(). Since we are already working with a page in this case we can use the sendpage interface instead. Create a new ceph_tcp_sendpage() helper that sets up flags to match the way ceph_tcp_sendmsg() does now. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 589b7689d31b..9207a8c0b214 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -321,6 +321,19 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, return r; } +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); + int ret; + + ret = kernel_sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + /* * Shutdown/close the socket for the given connection. @@ -944,12 +957,9 @@ static int write_partial_skip(struct ceph_connection *con) int ret; while (con->out_skip > 0) { - struct kvec iov = { - .iov_base = zero_page_address, - .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) - }; + size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); - ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); if (ret <= 0) goto out; con->out_skip -= ret; -- cgit v1.2.3-59-g8ed1b From e36b13cceb46136d849aeee06b4907ad3570ba78 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: libceph: only call kernel_sendpage() via helper Make ceph_tcp_sendpage() be the only place kernel_sendpage() is used, by using this helper in write_partial_msg_pages(). Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9207a8c0b214..adca1e6537ab 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -904,17 +904,13 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } - ret = kernel_sendpage(con->sock, page, + ret = ceph_tcp_sendpage(con->sock, page, con->out_msg_pos.page_pos + page_shift, - len, - MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_MORE); + len, 1); if (do_datacrc && kaddr != zero_page_address) kunmap(page); - if (ret == -EAGAIN) - ret = 0; if (ret <= 0) goto out; -- cgit v1.2.3-59-g8ed1b From 0cdf9e60189a87356a865a96dbafc2240af5c91d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: libceph: get rid of zero_page_address There's not a lot of benefit to zero_page_address, which basically holds a mapping of the zero page through the life of the messenger module. Even with our own mapping, the sendpage interface where it's used may need to kmap() it again. It's almost certain to be in low memory anyway. So stop treating the zero page specially in write_partial_msg_pages() and just get rid of zero_page_address entirely. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index adca1e6537ab..4f1714c4c93b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -61,7 +61,6 @@ static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); static struct page *zero_page; /* used in certain error cases */ -static void *zero_page_address; /* kernel virtual addr of zero_page */ const char *ceph_pr_addr(const struct sockaddr_storage *ss) { @@ -111,9 +110,6 @@ void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page_address == NULL); - zero_page_address = NULL; - BUG_ON(zero_page == NULL); kunmap(zero_page); page_cache_release(zero_page); @@ -126,9 +122,6 @@ int ceph_msgr_init(void) zero_page = ZERO_PAGE(0); page_cache_get(zero_page); - BUG_ON(zero_page_address != NULL); - zero_page_address = kmap(zero_page); - ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (ceph_msgr_wq) return 0; @@ -889,7 +882,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) } else { page = zero_page; if (do_datacrc) - kaddr = zero_page_address; + kaddr = kmap(page); } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); @@ -908,7 +901,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg_pos.page_pos + page_shift, len, 1); - if (do_datacrc && kaddr != zero_page_address) + if (do_datacrc) kunmap(page); if (ret <= 0) -- cgit v1.2.3-59-g8ed1b From 9bd1966344bf975b5ce65e80fd6bacc41b4325a8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: libceph: rename "page_shift" variable to something sensible In write_partial_msg_pages() there is a local variable used to track the starting offset within a bio segment to use. Its name, "page_shift" defies the Linux convention of using that name for log-base-2(page size). Since it's only used in the bio case rename it "bio_offset". Use it along with the page_pos field to compute the memory offset when computing CRC's in that function. This makes the bio case match the others more closely. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4f1714c4c93b..2bf9ab4429e6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -837,7 +837,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct page *page = NULL; void *kaddr = NULL; int max_write = PAGE_SIZE; - int page_shift = 0; + int bio_offset = 0; total_max_write = data_len - trail_len - con->out_msg_pos.data_pos; @@ -874,9 +874,9 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; - page_shift = bv->bv_offset; + bio_offset = bv->bv_offset; if (do_datacrc) - kaddr = kmap(page) + page_shift; + kaddr = kmap(page); max_write = bv->bv_len; #endif } else { @@ -888,17 +888,18 @@ static int write_partial_msg_pages(struct ceph_connection *con) total_max_write); if (do_datacrc && !con->out_msg_pos.did_page_crc) { + void *base; u32 crc; - void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); BUG_ON(kaddr == NULL); + base = kaddr + con->out_msg_pos.page_pos + bio_offset; crc = crc32c(tmpcrc, base, len); con->out_msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, - con->out_msg_pos.page_pos + page_shift, + con->out_msg_pos.page_pos + bio_offset, len, 1); if (do_datacrc) -- cgit v1.2.3-59-g8ed1b From 8d63e318c4eb1bea6f7e3cb4b77849eaa167bfec Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 7 Mar 2012 11:40:08 -0600 Subject: libceph: isolate kmap() call in write_partial_msg_pages() In write_partial_msg_pages(), every case now does an identical call to kmap(page). Instead, just call it once inside the CRC-computing block where it's needed. Move the definition of kaddr inside that block, and make it a (char *) to ensure portable pointer arithmetic. We still don't kunmap() it until after the sendpage() call, in case that also ends up needing to use the mapping. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2bf9ab4429e6..f0993af2ae4d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -835,7 +835,6 @@ static int write_partial_msg_pages(struct ceph_connection *con) while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; - void *kaddr = NULL; int max_write = PAGE_SIZE; int bio_offset = 0; @@ -856,18 +855,12 @@ static int write_partial_msg_pages(struct ceph_connection *con) page = list_first_entry(&msg->trail->head, struct page, lru); - if (do_datacrc) - kaddr = kmap(page); max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; - if (do_datacrc) - kaddr = kmap(page); } else if (msg->pagelist) { page = list_first_entry(&msg->pagelist->head, struct page, lru); - if (do_datacrc) - kaddr = kmap(page); #ifdef CONFIG_BLOCK } else if (msg->bio) { struct bio_vec *bv; @@ -875,14 +868,10 @@ static int write_partial_msg_pages(struct ceph_connection *con) bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); page = bv->bv_page; bio_offset = bv->bv_offset; - if (do_datacrc) - kaddr = kmap(page); max_write = bv->bv_len; #endif } else { page = zero_page; - if (do_datacrc) - kaddr = kmap(page); } len = min_t(int, max_write - con->out_msg_pos.page_pos, total_max_write); @@ -891,7 +880,9 @@ static int write_partial_msg_pages(struct ceph_connection *con) void *base; u32 crc; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + char *kaddr; + kaddr = kmap(page); BUG_ON(kaddr == NULL); base = kaddr + con->out_msg_pos.page_pos + bio_offset; crc = crc32c(tmpcrc, base, len); -- cgit v1.2.3-59-g8ed1b