Diffstat (limited to 'drivers/staging/lustre/lustre/ptlrpc/client.c')
-rw-r--r--  drivers/staging/lustre/lustre/ptlrpc/client.c | 338
1 file changed, 270 insertions(+), 68 deletions(-)
diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
index 8c51d51a678b..804741362bc0 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/client.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/client.c
@@ -43,6 +43,18 @@
#include "ptlrpc_internal.h"
+const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = {
+ .add_kiov_frag = ptlrpc_prep_bulk_page_pin,
+ .release_frags = ptlrpc_release_bulk_page_pin,
+};
+EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops);
+
+const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = {
+ .add_kiov_frag = ptlrpc_prep_bulk_page_nopin,
+ .release_frags = NULL,
+};
+EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops);
+
static int ptlrpc_send_new_req(struct ptlrpc_request *req);
static int ptlrpcd_check_work(struct ptlrpc_request *req);
static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async);
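The two ops tables differ only in whether each fragment holds a page reference. The pin variant's release hook is assumed to mirror the unpin loop this patch deletes from __ptlrpc_free_bulk() further down; a minimal sketch under that assumption (not the patch's actual body of ptlrpc_release_bulk_page_pin):

	/* Sketch: drop the per-fragment page reference taken by
	 * ptlrpc_prep_bulk_page_pin(); modeled on the unpin loop removed
	 * from __ptlrpc_free_bulk() later in this patch. */
	static void example_release_bulk_page_pin(struct ptlrpc_bulk_desc *desc)
	{
		int i;

		for (i = 0; i < desc->bd_iov_count; i++)
			put_page(BD_GET_KIOV(desc, i).bv_page);
	}

The nopin table leaves release_frags NULL, which is why the reworked ptlrpc_free_bulk() below calls the hook only when it is non-NULL.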
@@ -95,24 +107,43 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
* Allocate and initialize new bulk descriptor on the sender.
* Returns pointer to the descriptor or NULL on error.
*/
-struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
- unsigned type, unsigned portal)
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags,
+ unsigned int max_brw,
+ enum ptlrpc_bulk_op_type type,
+ unsigned int portal,
+ const struct ptlrpc_bulk_frag_ops *ops)
{
struct ptlrpc_bulk_desc *desc;
int i;
- desc = kzalloc(offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]),
- GFP_NOFS);
+ /* ensure the requested buffer type matches the frag ops supplied */
+ LASSERT((ptlrpc_is_bulk_desc_kiov(type) && ops->add_kiov_frag) ||
+ (ptlrpc_is_bulk_desc_kvec(type) && ops->add_iov_frag));
+
+ desc = kzalloc(sizeof(*desc), GFP_NOFS);
if (!desc)
return NULL;
+ if (type & PTLRPC_BULK_BUF_KIOV) {
+ GET_KIOV(desc) = kcalloc(nfrags, sizeof(*GET_KIOV(desc)),
+ GFP_NOFS);
+ if (!GET_KIOV(desc))
+ goto free_desc;
+ } else {
+ GET_KVEC(desc) = kcalloc(nfrags, sizeof(*GET_KVEC(desc)),
+ GFP_NOFS);
+ if (!GET_KVEC(desc))
+ goto free_desc;
+ }
+
spin_lock_init(&desc->bd_lock);
init_waitqueue_head(&desc->bd_waitq);
- desc->bd_max_iov = npages;
+ desc->bd_max_iov = nfrags;
desc->bd_iov_count = 0;
desc->bd_portal = portal;
desc->bd_type = type;
desc->bd_md_count = 0;
+ desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *)ops;
LASSERT(max_brw > 0);
desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
/*
@@ -123,24 +154,31 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
LNetInvalidateHandle(&desc->bd_mds[i]);
return desc;
+free_desc:
+ kfree(desc);
+ return NULL;
}
/**
* Prepare bulk descriptor for specified outgoing request \a req that
- * can fit \a npages * pages. \a type is bulk type. \a portal is where
+ * can fit \a nfrags fragments. \a type is the bulk type. \a portal is where
the bulk is to be sent. Used on the client side.
* Returns pointer to newly allocated initialized bulk descriptor or NULL on
* error.
*/
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
- unsigned npages, unsigned max_brw,
- unsigned type, unsigned portal)
+ unsigned int nfrags,
+ unsigned int max_brw,
+ unsigned int type,
+ unsigned int portal,
+ const struct ptlrpc_bulk_frag_ops *ops)
{
struct obd_import *imp = req->rq_import;
struct ptlrpc_bulk_desc *desc;
- LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
- desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
+ LASSERT(ptlrpc_is_bulk_op_passive(type));
+
+ desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops);
if (!desc)
return NULL;
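Callers now select the buffer flavor and pinning policy through the ops argument. A hedged caller-side sketch of the post-patch flow (the request, page source, max_brw of 1, and OST_BULK_PORTAL here are illustrative, not taken from this patch):

	/* Sketch: prepare a pinned KIOV bulk on the client and add pages
	 * through the frag ops table. */
	static struct ptlrpc_bulk_desc *
	example_prep_bulk(struct ptlrpc_request *req, struct page **pages,
			  unsigned int npages)
	{
		struct ptlrpc_bulk_desc *desc;
		unsigned int i;

		desc = ptlrpc_prep_bulk_imp(req, npages, 1,
					    PTLRPC_BULK_GET_SOURCE |
					    PTLRPC_BULK_BUF_KIOV,
					    OST_BULK_PORTAL,
					    &ptlrpc_bulk_kiov_pin_ops);
		if (!desc)
			return NULL;

		/* pin ops take a page reference per fragment */
		for (i = 0; i < npages; i++)
			desc->bd_frag_ops->add_kiov_frag(desc, pages[i],
							 0, PAGE_SIZE);
		return desc;
	}

GET_SOURCE|KIOV satisfies both new assertions: ptlrpc_is_bulk_op_passive() in ptlrpc_prep_bulk_imp() and the type/ops consistency check in ptlrpc_new_bulk().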
@@ -158,56 +196,82 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
}
EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
-/**
- * Add a page \a page to the bulk descriptor \a desc.
- * Data to transfer in the page starts at offset \a pageoffset and
- * amount of data to transfer from the page is \a len
- */
void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
struct page *page, int pageoffset, int len, int pin)
{
+ struct bio_vec *kiov;
+
LASSERT(desc->bd_iov_count < desc->bd_max_iov);
LASSERT(page);
LASSERT(pageoffset >= 0);
LASSERT(len > 0);
LASSERT(pageoffset + len <= PAGE_SIZE);
+ LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
+
+ kiov = &BD_GET_KIOV(desc, desc->bd_iov_count);
desc->bd_nob += len;
if (pin)
get_page(page);
- ptlrpc_add_bulk_page(desc, page, pageoffset, len);
+ kiov->bv_page = page;
+ kiov->bv_offset = pageoffset;
+ kiov->bv_len = len;
+
+ desc->bd_iov_count++;
}
EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
-/**
- * Uninitialize and free bulk descriptor \a desc.
- * Works on bulk descriptors both from server and client side.
- */
-void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
+int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc,
+ void *frag, int len)
{
- int i;
+ struct kvec *iovec;
+
+ LASSERT(desc->bd_iov_count < desc->bd_max_iov);
+ LASSERT(frag);
+ LASSERT(len > 0);
+ LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type));
+ iovec = &BD_GET_KVEC(desc, desc->bd_iov_count);
+
+ desc->bd_nob += len;
+
+ iovec->iov_base = frag;
+ iovec->iov_len = len;
+
+ desc->bd_iov_count++;
+
+ return desc->bd_nob;
+}
+EXPORT_SYMBOL(ptlrpc_prep_bulk_frag);
+
+void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
+{
LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
LASSERT(desc->bd_md_count == 0); /* network hands off */
LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+ LASSERT(desc->bd_frag_ops);
- sptlrpc_enc_pool_put_pages(desc);
+ if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
+ sptlrpc_enc_pool_put_pages(desc);
if (desc->bd_export)
class_export_put(desc->bd_export);
else
class_import_put(desc->bd_import);
- if (unpin) {
- for (i = 0; i < desc->bd_iov_count; i++)
- put_page(desc->bd_iov[i].bv_page);
- }
+ if (desc->bd_frag_ops->release_frags)
+ desc->bd_frag_ops->release_frags(desc);
+
+ if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
+ kfree(GET_KIOV(desc));
+ else
+ kfree(GET_KVEC(desc));
kfree(desc);
}
-EXPORT_SYMBOL(__ptlrpc_free_bulk);
+EXPORT_SYMBOL(ptlrpc_free_bulk);
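The new kvec path returns the running byte count, so a caller can accumulate arbitrary kernel buffers. A short usage sketch (buffer names are illustrative; assumes desc was created with a PTLRPC_BULK_BUF_KVEC type and kvec-capable frag ops):

	/* Sketch: fill a KVEC bulk with two kernel buffers; each call
	 * returns the updated bd_nob, i.e. the running total in bytes. */
	static int example_fill_kvec_bulk(struct ptlrpc_bulk_desc *desc,
					  void *hdr, int hdr_len,
					  void *body, int body_len)
	{
		int nob;

		nob = ptlrpc_prep_bulk_frag(desc, hdr, hdr_len);
		nob = ptlrpc_prep_bulk_frag(desc, body, body_len);
		return nob;	/* == hdr_len + body_len */
	}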
/**
* Set server timelimit for this req, i.e. how long are we willing to wait
@@ -589,6 +653,42 @@ static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
spin_unlock(&pool->prp_lock);
}
+void ptlrpc_add_unreplied(struct ptlrpc_request *req)
+{
+ struct obd_import *imp = req->rq_import;
+ struct list_head *tmp;
+ struct ptlrpc_request *iter;
+
+ assert_spin_locked(&imp->imp_lock);
+ LASSERT(list_empty(&req->rq_unreplied_list));
+
+ /* unreplied list is sorted by xid in ascending order */
+ list_for_each_prev(tmp, &imp->imp_unreplied_list) {
+ iter = list_entry(tmp, struct ptlrpc_request,
+ rq_unreplied_list);
+
+ LASSERT(req->rq_xid != iter->rq_xid);
+ if (req->rq_xid < iter->rq_xid)
+ continue;
+ list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list);
+ return;
+ }
+ list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list);
+}
+
+void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req)
+{
+ req->rq_xid = ptlrpc_next_xid();
+ ptlrpc_add_unreplied(req);
+}
+
+static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req)
+{
+ spin_lock(&req->rq_import->imp_lock);
+ ptlrpc_assign_next_xid_nolock(req);
+ spin_unlock(&req->rq_import->imp_lock);
+}
+
int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
__u32 version, int opcode, char **bufs,
struct ptlrpc_cli_ctx *ctx)
@@ -637,8 +737,8 @@ int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
ptlrpc_at_set_req_timeout(request);
- request->rq_xid = ptlrpc_next_xid();
lustre_msg_set_opc(request->rq_reqmsg, opcode);
+ ptlrpc_assign_next_xid(request);
/* Let's setup deadline for req/reply/bulk unlink for opcode. */
if (cfs_fail_val == opcode) {
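Because the insertion scans backward from the tail and XIDs are normally assigned in increasing order, a new request usually links in after a single comparison. A standalone array model of the ordering logic (userspace C, hypothetical XIDs; the real list_for_each_prev() walk splices list nodes instead of shifting elements):

	#include <stdio.h>

	int main(void)
	{
		/* sorted unreplied XIDs, modeling imp_unreplied_list */
		unsigned long long xids[8] = { 5, 7, 9 };
		int count = 3, pos, i;
		unsigned long long xid = 8;	/* request with an out-of-order XID */

		/* backward scan, as list_for_each_prev() in the patch */
		for (pos = count; pos > 0 && xids[pos - 1] > xid; pos--)
			;
		for (i = count; i > pos; i--)	/* open the slot */
			xids[i] = xids[i - 1];
		xids[pos] = xid;
		count++;

		for (i = 0; i < count; i++)
			printf("%llu ", xids[i]);	/* prints: 5 7 8 9 */
		printf("\n");
		return 0;
	}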
@@ -1129,7 +1229,9 @@ static int ptlrpc_check_status(struct ptlrpc_request *req)
lnet_nid_t nid = imp->imp_connection->c_peer.nid;
__u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
- if (ptlrpc_console_allow(req))
+ /* -EAGAIN is normal when using POSIX flocks */
+ if (ptlrpc_console_allow(req) &&
+ !(opc == LDLM_ENQUEUE && err == -EAGAIN))
LCONSOLE_ERROR_MSG(0x011, "%s: operation %s to node %s failed: rc = %d\n",
imp->imp_obd->obd_name,
ll_opcode2str(opc),
@@ -1166,6 +1268,24 @@ static void ptlrpc_save_versions(struct ptlrpc_request *req)
versions[0], versions[1]);
}
+__u64 ptlrpc_known_replied_xid(struct obd_import *imp)
+{
+ struct ptlrpc_request *req;
+
+ assert_spin_locked(&imp->imp_lock);
+ if (list_empty(&imp->imp_unreplied_list))
+ return 0;
+
+ req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request,
+ rq_unreplied_list);
+ LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid);
+
+ if (imp->imp_known_replied_xid < req->rq_xid - 1)
+ imp->imp_known_replied_xid = req->rq_xid - 1;
+
+ return req->rq_xid - 1;
+}
+
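In other words, everything strictly below the smallest in-flight XID is known to be replied. Continuing the array model above (hypothetical values, not this patch's list-based implementation):

	/* Sketch: with unreplied XIDs {5, 7, 8, 9}, the head of the sorted
	 * list bounds what the peer has certainly replied to: 5 - 1 = 4. */
	static unsigned long long example_known_replied_xid(
			const unsigned long long *xids, int count)
	{
		return count ? xids[0] - 1 : 0;
	}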
/**
* Callback function called when client receives RPC reply for \a req.
* Returns 0 on success or error code.
@@ -1180,6 +1300,7 @@ static int after_reply(struct ptlrpc_request *req)
int rc;
struct timespec64 work_start;
long timediff;
+ u64 committed;
LASSERT(obd);
/* repbuf must be unlinked */
@@ -1206,6 +1327,10 @@ static int after_reply(struct ptlrpc_request *req)
return 0;
}
+ ktime_get_real_ts64(&work_start);
+ timediff = (work_start.tv_sec - req->rq_sent_tv.tv_sec) * USEC_PER_SEC +
+ (work_start.tv_nsec - req->rq_sent_tv.tv_nsec) /
+ NSEC_PER_USEC;
/*
* NB Until this point, the whole of the incoming message,
* including buflens, status etc is in the sender's byte order.
@@ -1235,13 +1360,6 @@ static int after_reply(struct ptlrpc_request *req)
spin_unlock(&req->rq_lock);
req->rq_nr_resend++;
- /* allocate new xid to avoid reply reconstruction */
- if (!req->rq_bulk) {
- /* new xid is already allocated for bulk in ptlrpc_check_set() */
- req->rq_xid = ptlrpc_next_xid();
- DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS");
- }
-
/* Readjust the timeout for current conditions */
ptlrpc_at_set_req_timeout(req);
/*
@@ -1255,13 +1373,14 @@ static int after_reply(struct ptlrpc_request *req)
else
req->rq_sent = now + req->rq_nr_resend;
+ /* Resend for EINPROGRESS will use a new XID */
+ spin_lock(&imp->imp_lock);
+ list_del_init(&req->rq_unreplied_list);
+ spin_unlock(&imp->imp_lock);
+
return 0;
}
- ktime_get_real_ts64(&work_start);
- timediff = (work_start.tv_sec - req->rq_sent_tv.tv_sec) * USEC_PER_SEC +
- (work_start.tv_nsec - req->rq_sent_tv.tv_nsec) /
- NSEC_PER_USEC;
if (obd->obd_svc_stats) {
lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
timediff);
@@ -1338,10 +1457,9 @@ static int after_reply(struct ptlrpc_request *req)
}
/* Replay-enabled imports return commit-status information. */
- if (lustre_msg_get_last_committed(req->rq_repmsg)) {
- imp->imp_peer_committed_transno =
- lustre_msg_get_last_committed(req->rq_repmsg);
- }
+ committed = lustre_msg_get_last_committed(req->rq_repmsg);
+ if (likely(committed > imp->imp_peer_committed_transno))
+ imp->imp_peer_committed_transno = committed;
ptlrpc_free_committed(imp);
@@ -1373,9 +1491,17 @@ static int after_reply(struct ptlrpc_request *req)
static int ptlrpc_send_new_req(struct ptlrpc_request *req)
{
struct obd_import *imp = req->rq_import;
+ u64 min_xid = 0;
int rc;
LASSERT(req->rq_phase == RQ_PHASE_NEW);
+
+ /* do not try to go further if there is not enough memory in enc_pool */
+ if (req->rq_sent && req->rq_bulk)
+ if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() &&
+ pool_is_at_full_capacity())
+ return -ENOMEM;
+
if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) &&
(!req->rq_generation_set ||
req->rq_import_generation == imp->imp_generation))
@@ -1385,6 +1511,9 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
spin_lock(&imp->imp_lock);
+ LASSERT(req->rq_xid);
+ LASSERT(!list_empty(&req->rq_unreplied_list));
+
if (!req->rq_generation_set)
req->rq_import_generation = imp->imp_generation;
@@ -1414,8 +1543,25 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
LASSERT(list_empty(&req->rq_list));
list_add_tail(&req->rq_list, &imp->imp_sending_list);
atomic_inc(&req->rq_import->imp_inflight);
+
+ /* Find the known replied XID from the unreplied list; CONNECT
+ * and DISCONNECT requests are skipped to keep the sanity check
+ * on the server side happy, see process_req_last_xid().
+ *
+ * For CONNECT: because replay requests have lower XIDs, the
+ * sanity check would break if CONNECT bumped the exp_last_xid
+ * on the server.
+ *
+ * For DISCONNECT: since the client aborts inflight RPCs before
+ * sending DISCONNECT, DISCONNECT may carry an XID higher than
+ * those of the inflight RPCs.
+ */
+ if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req))
+ min_xid = ptlrpc_known_replied_xid(imp);
spin_unlock(&imp->imp_lock);
+ lustre_msg_set_last_xid(req->rq_reqmsg, min_xid);
+
lustre_msg_set_status(req->rq_reqmsg, current_pid());
rc = sptlrpc_req_refresh_ctx(req, -1);
@@ -1438,6 +1584,16 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
lustre_msg_get_opc(req->rq_reqmsg));
rc = ptl_send_rpc(req, 0);
+ if (rc == -ENOMEM) {
+ spin_lock(&imp->imp_lock);
+ if (!list_empty(&req->rq_list)) {
+ list_del_init(&req->rq_list);
+ atomic_dec(&req->rq_import->imp_inflight);
+ }
+ spin_unlock(&imp->imp_lock);
+ ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
+ return rc;
+ }
if (rc) {
DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
spin_lock(&req->rq_lock);
@@ -1688,18 +1844,9 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
spin_lock(&req->rq_lock);
req->rq_resend = 1;
spin_unlock(&req->rq_lock);
- if (req->rq_bulk) {
- __u64 old_xid;
-
- if (!ptlrpc_unregister_bulk(req, 1))
- continue;
-
- /* ensure previous bulk fails */
- old_xid = req->rq_xid;
- req->rq_xid = ptlrpc_next_xid();
- CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
- old_xid, req->rq_xid);
- }
+ if (req->rq_bulk &&
+ !ptlrpc_unregister_bulk(req, 1))
+ continue;
}
/*
* rq_wait_ctx is only touched by ptlrpcd,
@@ -1727,6 +1874,14 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
}
rc = ptl_send_rpc(req, 0);
+ if (rc == -ENOMEM) {
+ spin_lock(&imp->imp_lock);
+ if (!list_empty(&req->rq_list))
+ list_del_init(&req->rq_list);
+ spin_unlock(&imp->imp_lock);
+ ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
+ continue;
+ }
if (rc) {
DEBUG_REQ(D_HA, req,
"send failed: rc = %d", rc);
@@ -1850,6 +2005,7 @@ interpret:
list_del_init(&req->rq_list);
atomic_dec(&imp->imp_inflight);
}
+ list_del_init(&req->rq_unreplied_list);
spin_unlock(&imp->imp_lock);
atomic_dec(&set->set_remaining);
@@ -2247,6 +2403,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
if (!locked)
spin_lock(&request->rq_import->imp_lock);
list_del_init(&request->rq_replay_list);
+ list_del_init(&request->rq_unreplied_list);
if (!locked)
spin_unlock(&request->rq_import->imp_lock);
}
@@ -2266,7 +2423,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
request->rq_import = NULL;
}
if (request->rq_bulk)
- ptlrpc_free_bulk_pin(request->rq_bulk);
+ ptlrpc_free_bulk(request->rq_bulk);
if (request->rq_reqbuf || request->rq_clrbuf)
sptlrpc_cli_free_reqbuf(request);
@@ -2542,14 +2699,6 @@ void ptlrpc_resend_req(struct ptlrpc_request *req)
req->rq_resend = 1;
req->rq_net_err = 0;
req->rq_timedout = 0;
- if (req->rq_bulk) {
- __u64 old_xid = req->rq_xid;
-
- /* ensure previous bulk fails */
- req->rq_xid = ptlrpc_next_xid();
- CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
- old_xid, req->rq_xid);
- }
ptlrpc_client_wake_req(req);
spin_unlock(&req->rq_lock);
}
@@ -2592,6 +2741,10 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
+ spin_lock(&req->rq_lock);
+ req->rq_resend = 0;
+ spin_unlock(&req->rq_lock);
+
LASSERT(imp->imp_replayable);
/* Balanced in ptlrpc_free_committed, usually. */
ptlrpc_request_addref(req);
@@ -2667,8 +2820,15 @@ static int ptlrpc_replay_interpret(const struct lu_env *env,
atomic_dec(&imp->imp_replay_inflight);
- if (!ptlrpc_client_replied(req)) {
- CERROR("request replay timed out, restarting recovery\n");
+ /*
+ * Note: for bulk replay (MDS-MDS replay), even if the server got
+ * the request, if the bulk transfer timed out, replay the bulk
+ * request again.
+ */
+ if (!ptlrpc_client_replied(req) ||
+ (req->rq_bulk &&
+ lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) {
+ DEBUG_REQ(D_ERROR, req, "request replay timed out.\n");
rc = -ETIMEDOUT;
goto out;
}
@@ -2939,6 +3099,48 @@ __u64 ptlrpc_next_xid(void)
}
/**
+ * If the request has a newly allocated XID (new request or EINPROGRESS
+ * resend), use that XID as the bulk matchbits; otherwise allocate new
+ * matchbits for the request, to ensure any previous bulk fails and to
+ * avoid lost replies causing several transfers from different sending
+ * attempts to land in the same buffer.
+ */
+void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req)
+{
+ struct ptlrpc_bulk_desc *bd = req->rq_bulk;
+
+ LASSERT(bd);
+
+ if (!req->rq_resend) {
+ /* this request has a new xid, just use it as bulk matchbits */
+ req->rq_mbits = req->rq_xid;
+
+ } else { /* need to generate new matchbits for resend */
+ u64 old_mbits = req->rq_mbits;
+
+ if ((bd->bd_import->imp_connect_data.ocd_connect_flags &
+ OBD_CONNECT_BULK_MBITS)) {
+ req->rq_mbits = ptlrpc_next_xid();
+ } else {
+ /* old version transfers rq_xid to peer as matchbits */
+ req->rq_mbits = ptlrpc_next_xid();
+ req->rq_xid = req->rq_mbits;
+ }
+
+ CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
+ old_mbits, req->rq_mbits);
+ }
+
+ /*
+ * For multi-bulk RPCs, rq_mbits is the last mbits needed for the
+ * bulks, so that the server can infer the number of bulks that
+ * were prepared; see LU-1431.
+ */
+ req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) /
+ LNET_MAX_IOV) - 1;
+}
+
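To make that final adjustment concrete, a hypothetical worked example (LNET_MAX_IOV is assumed to be 256, its usual value; the XID and fragment count are made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned int lnet_max_iov = 256;	/* assumed LNET_MAX_IOV */
		unsigned int bd_iov_count = 600;	/* fragments in the bulk */
		unsigned long long rq_xid = 1000, rq_mbits = rq_xid;

		/* ceil(600 / 256) = 3 MDs, so advance mbits by 2; same
		 * expression as the end of ptlrpc_set_bulk_mbits() */
		rq_mbits += (bd_iov_count + lnet_max_iov - 1) / lnet_max_iov - 1;

		/* matchbits 1000..1002 cover the 3 MDs; sending 1002 as
		 * the last value lets the server recompute the count */
		printf("last mbits = %llu\n", rq_mbits);	/* 1002 */
		return 0;
	}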
+/**
+ * Get a glimpse at what the next XID value might have been.
* Returns possible next xid.
*/