1 files changed, 100 insertions, 86 deletions
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
index 2a7a70aa9e80..e356e4af08e1 100644
--- a/drivers/staging/lustre/lustre/osc/osc_page.c
+++ b/drivers/staging/lustre/lustre/osc/osc_page.c
@@ -37,6 +37,7 @@
 
 #define DEBUG_SUBSYSTEM S_OSC
 
+#include <linux/math64.h>
 #include "osc_cl_internal.h"
 
 static void osc_lru_del(struct client_obd *cli, struct osc_page *opg);
@@ -86,11 +87,6 @@ static void osc_page_transfer_add(const struct lu_env *env,
 	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
 
 	osc_lru_use(osc_cli(obj), opg);
-
-	spin_lock(&obj->oo_seatbelt);
-	list_add(&opg->ops_inflight, &obj->oo_inflight[crt]);
-	opg->ops_submitter = current;
-	spin_unlock(&obj->oo_seatbelt);
 }
 
 int osc_page_cache_add(const struct lu_env *env,
@@ -109,7 +105,8 @@ int osc_page_cache_add(const struct lu_env *env,
 	return result;
 }
 
-void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+void osc_index2policy(union ldlm_policy_data *policy,
+		      const struct cl_object *obj,
 		      pgoff_t start, pgoff_t end)
 {
 	memset(policy, 0, sizeof(*policy));
@@ -117,25 +114,6 @@ void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
 	policy->l_extent.end = cl_offset(obj, end + 1) - 1;
 }
 
-static int osc_page_is_under_lock(const struct lu_env *env,
-				  const struct cl_page_slice *slice,
-				  struct cl_io *unused, pgoff_t *max_index)
-{
-	struct osc_page *opg = cl2osc_page(slice);
-	struct ldlm_lock *dlmlock;
-	int result = -ENODATA;
-
-	dlmlock = osc_dlmlock_at_pgoff(env, cl2osc(slice->cpl_obj),
-				       osc_index(opg), 1, 0);
-	if (dlmlock) {
-		*max_index = cl_index(slice->cpl_obj,
-				      dlmlock->l_policy_data.l_extent.end);
-		LDLM_LOCK_PUT(dlmlock);
-		result = 0;
-	}
-	return result;
-}
-
 static const char *osc_list(struct list_head *head)
 {
 	return list_empty(head) ? "-" : "+";
@@ -158,7 +136,7 @@ static int osc_page_print(const struct lu_env *env,
 	struct osc_object *obj = cl2osc(slice->cpl_obj);
 	struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli;
 
-	return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p %lu: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %s %p %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n",
+	return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p %lu: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n",
 			  opg, osc_index(opg),
 			  /* 1 */
 			  oap->oap_magic, oap->oap_cmd,
@@ -170,8 +148,7 @@ static int osc_page_print(const struct lu_env *env,
 			  oap->oap_async_flags, oap->oap_brw_flags,
 			  oap->oap_request, oap->oap_cli, obj,
 			  /* 3 */
-			  osc_list(&opg->ops_inflight),
-			  opg->ops_submitter, opg->ops_transfer_pinned,
+			  opg->ops_transfer_pinned,
 			  osc_submit_duration(opg), opg->ops_srvlock,
 			  /* 4 */
 			  cli->cl_r_in_flight, cli->cl_w_in_flight,
@@ -210,14 +187,6 @@ static void osc_page_delete(const struct lu_env *env,
 		LASSERT(0);
 	}
 
-	spin_lock(&obj->oo_seatbelt);
-	if (opg->ops_submitter) {
-		LASSERT(!list_empty(&opg->ops_inflight));
-		list_del_init(&opg->ops_inflight);
-		opg->ops_submitter = NULL;
-	}
-	spin_unlock(&obj->oo_seatbelt);
-
 	osc_lru_del(osc_cli(obj), opg);
 
 	if (slice->cpl_page->cp_type == CPT_CACHEABLE) {
@@ -276,7 +245,6 @@ static int osc_page_flush(const struct lu_env *env,
 static const struct cl_page_operations osc_page_ops = {
 	.cpo_print	 = osc_page_print,
 	.cpo_delete	= osc_page_delete,
-	.cpo_is_under_lock = osc_page_is_under_lock,
 	.cpo_clip	   = osc_page_clip,
 	.cpo_cancel	 = osc_page_cancel,
 	.cpo_flush	  = osc_page_flush
@@ -301,10 +269,6 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj,
 		cl_page_slice_add(page, &opg->ops_cl, obj, index,
 				  &osc_page_ops);
 	}
-	/* ops_inflight and ops_lru are the same field, but it doesn't
-	 * hurt to initialize it twice :-)
-	 */
-	INIT_LIST_HEAD(&opg->ops_inflight);
 	INIT_LIST_HEAD(&opg->ops_lru);
 
 	/* reserve an LRU space for this page */
@@ -362,16 +326,27 @@ void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
  * OSC to free slots voluntarily to maintain a reasonable number of free slots
  * at any time.
  */
-
 static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq);
-/* LRU pages are freed in batch mode. OSC should at least free this
- * number of pages to avoid running out of LRU budget, and..
+
+/**
+ * LRU pages are freed in batch mode. OSC should at least free this
+ * number of pages to avoid running out of LRU slots.
+ */
+static inline int lru_shrink_min(struct client_obd *cli)
+{
+	return cli->cl_max_pages_per_rpc * 2;
+}
+
+/**
+ * Free at most this many pages; otherwise shrinking takes too long to finish.
  */
-static const int lru_shrink_min = 2 << (20 - PAGE_SHIFT);  /* 2M */
-/* free this number at most otherwise it will take too long time to finish. */
-static const int lru_shrink_max = 8 << (20 - PAGE_SHIFT); /* 8M */
+static inline int lru_shrink_max(struct client_obd *cli)
+{
+	return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight;
+}
 
-/* Check if we can free LRU slots from this OSC. If there exists LRU waiters,
+/**
+ * Check if we can free LRU slots from this OSC. If there exists LRU waiters,
  * we should free slots aggressively. In this way, slots are freed in a steady
  * step to maintain fairness among OSCs.
  *
@@ -388,13 +363,20 @@ static int osc_cache_too_much(struct client_obd *cli)
 	/* if it's going to run out LRU slots, we should free some, but not
 	 * too much to maintain fairness among OSCs.
 	 */
-	if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) {
+	if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) {
 		if (pages >= budget)
-			return lru_shrink_max;
+			return lru_shrink_max(cli);
 		else if (pages >= budget / 2)
-			return lru_shrink_min;
-	} else if (pages >= budget * 2) {
-		return lru_shrink_min;
+			return lru_shrink_min(cli);
+	} else {
+		time64_t duration = ktime_get_real_seconds();
+
+		/* knock out pages by duration of no IO activity */
+		duration -= cli->cl_lru_last_used;
+		duration >>= 6; /* approximately 1 minute */
+		if (duration > 0 &&
+		    pages >= div64_s64((s64)budget, duration))
+			return lru_shrink_min(cli);
 	}
 	return 0;
 }
@@ -402,11 +384,21 @@ static int osc_cache_too_much(struct client_obd *cli)
 int lru_queue_work(const struct lu_env *env, void *data)
 {
 	struct client_obd *cli = data;
+	int count;
 
-	CDEBUG(D_CACHE, "Run LRU work for client obd %p.\n", cli);
+	CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli));
 
-	if (osc_cache_too_much(cli))
-		osc_lru_shrink(env, cli, lru_shrink_max, true);
+	count = osc_cache_too_much(cli);
+	if (count > 0) {
+		int rc = osc_lru_shrink(env, cli, count, false);
+
+		CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n",
+		       cli_name(cli), rc, count);
+		if (rc >= count) {
+			CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli));
+			ptlrpcd_queue_work(cli->cl_lru_work);
+		}
+	}
 
 	return 0;
 }
@@ -433,10 +425,10 @@ void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist)
 		list_splice_tail(&lru, &cli->cl_lru_list);
 		atomic_long_sub(npages, &cli->cl_lru_busy);
 		atomic_long_add(npages, &cli->cl_lru_in_list);
+		cli->cl_lru_last_used = ktime_get_real_seconds();
 		spin_unlock(&cli->cl_lru_list_lock);
 
-		/* XXX: May set force to be true for better performance */
-		if (osc_cache_too_much(cli))
+		if (waitqueue_active(&osc_lru_waitq))
 			(void)ptlrpcd_queue_work(cli->cl_lru_work);
 	}
 }
@@ -469,8 +461,10 @@ static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
 		 * this osc occupies too many LRU pages and kernel is
 		 * stealing one of them.
 		 */
-		if (!memory_pressure_get())
+		if (osc_cache_too_much(cli)) {
+			CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli));
 			(void)ptlrpcd_queue_work(cli->cl_lru_work);
+		}
 		wake_up(&osc_lru_waitq);
 	} else {
 		LASSERT(list_empty(&opg->ops_lru));
@@ -502,6 +496,7 @@ static void discard_pagevec(const struct lu_env *env, struct cl_io *io,
 		struct cl_page *page = pvec[i];
 
 		LASSERT(cl_page_is_owned(page, io));
+		cl_page_delete(env, page);
 		cl_page_discard(env, io, page);
 		cl_page_disown(env, io, page);
 		cl_page_put(env, page);
@@ -542,7 +537,6 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
 	struct cl_object *clobj = NULL;
 	struct cl_page **pvec;
 	struct osc_page *opg;
-	struct osc_page *temp;
 	int maxscan = 0;
 	long count = 0;
 	int index = 0;
@@ -552,6 +546,8 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
 	if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
 		return 0;
 
+	CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n",
+	       cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force);
 	if (!force) {
 		if (atomic_read(&cli->cl_lru_shrinkers) > 0)
 			return -EBUSY;
@@ -568,14 +564,21 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
 	io = &osc_env_info(env)->oti_io;
 
 	spin_lock(&cli->cl_lru_list_lock);
+	if (force)
+		cli->cl_lru_reclaim++;
 	maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list));
-	list_for_each_entry_safe(opg, temp, &cli->cl_lru_list, ops_lru) {
+	while (!list_empty(&cli->cl_lru_list)) {
 		struct cl_page *page;
 		bool will_free = false;
 
+		if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1)
+			break;
+
 		if (--maxscan < 0)
 			break;
 
+		opg = list_entry(cli->cl_lru_list.next, struct osc_page,
+				 ops_lru);
 		page = opg->ops_cl.cpl_page;
 		if (lru_page_busy(cli, page)) {
 			list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
@@ -662,34 +665,43 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
 	return count > 0 ? count : rc;
 }
 
-long osc_lru_reclaim(struct client_obd *cli)
+/**
+ * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least
+ * \@npages of LRU slots. For performance reasons, it's better to drop
+ * LRU pages in batch. Therefore, the actual number is adjusted to at least
+ * max_pages_per_rpc.
+ */
+long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
 {
-	struct cl_env_nest nest;
 	struct lu_env *env;
 	struct cl_client_cache *cache = cli->cl_cache;
 	int max_scans;
+	int refcheck;
 	long rc = 0;
 
 	LASSERT(cache);
 
-	env = cl_env_nested_get(&nest);
+	env = cl_env_get(&refcheck);
 	if (IS_ERR(env))
 		return 0;
 
-	rc = osc_lru_shrink(env, cli, osc_cache_too_much(cli), false);
-	if (rc != 0) {
-		if (rc == -EBUSY)
-			rc = 0;
-
-		CDEBUG(D_CACHE, "%s: Free %ld pages from own LRU: %p.\n",
-		       cli->cl_import->imp_obd->obd_name, rc, cli);
+	npages = max_t(int, npages, cli->cl_max_pages_per_rpc);
+	CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n",
+	       cli_name(cli), npages);
+	rc = osc_lru_shrink(env, cli, npages, true);
+	if (rc >= npages) {
+		CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n",
+		       cli_name(cli), rc, npages);
+		if (osc_cache_too_much(cli) > 0)
+			ptlrpcd_queue_work(cli->cl_lru_work);
 		goto out;
+	} else if (rc > 0) {
+		npages -= rc;
 	}
 
-	CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld, busy: %ld.\n",
-	       cli->cl_import->imp_obd->obd_name, cli,
-	       atomic_long_read(&cli->cl_lru_in_list),
-	       atomic_long_read(&cli->cl_lru_busy));
+	CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n",
+	       cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
+	       atomic_long_read(&cli->cl_lru_busy), npages);
 
 	/* Reclaim LRU slots from other client_obd as it can't free enough
 	 * from its own. This should rarely happen.
@@ -706,7 +718,7 @@ long osc_lru_reclaim(struct client_obd *cli)
 				 cl_lru_osc);
 
 		CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n",
-		       cli->cl_import->imp_obd->obd_name, cli,
+		       cli_name(cli), cli,
 		       atomic_long_read(&cli->cl_lru_in_list),
 		       atomic_long_read(&cli->cl_lru_busy));
 
@@ -714,19 +726,20 @@ long osc_lru_reclaim(struct client_obd *cli)
 		if (osc_cache_too_much(cli) > 0) {
 			spin_unlock(&cache->ccc_lru_lock);
 
-			rc = osc_lru_shrink(env, cli, osc_cache_too_much(cli),
-					    true);
+			rc = osc_lru_shrink(env, cli, npages, true);
 			spin_lock(&cache->ccc_lru_lock);
-			if (rc != 0)
+			if (rc >= npages)
 				break;
+			if (rc > 0)
+				npages -= rc;
 		}
 	}
 	spin_unlock(&cache->ccc_lru_lock);
 
 out:
-	cl_env_nested_put(&nest, env);
+	cl_env_put(env, &refcheck);
 	CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n",
-	       cli->cl_import->imp_obd->obd_name, cli, rc);
+	       cli_name(cli), cli, rc);
 	return rc;
 }
 
@@ -756,7 +769,7 @@ static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
 	LASSERT(atomic_long_read(cli->cl_lru_left) >= 0);
 	while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) {
 		/* run out of LRU spaces, try to drop some by itself */
-		rc = osc_lru_reclaim(cli);
+		rc = osc_lru_reclaim(cli, 1);
 		if (rc < 0)
 			break;
 		if (rc > 0)
@@ -796,8 +809,10 @@ static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
 	int count = 0;
 	int i;
 
+	LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
+
 	for (i = 0; i < page_count; i++) {
-		pg_data_t *pgdat = page_pgdat(desc->bd_iov[i].bv_page);
+		pg_data_t *pgdat = page_pgdat(BD_GET_KIOV(desc, i).bv_page);
 
 		if (likely(pgdat == last)) {
 			++count;
@@ -857,7 +872,7 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req)
 	if (!unstable_count)
 		wake_up_all(&cli->cl_cache->ccc_unstable_waitq);
 
-	if (osc_cache_too_much(cli))
+	if (waitqueue_active(&osc_lru_waitq))
 		(void)ptlrpcd_queue_work(cli->cl_lru_work);
 }
 
@@ -913,8 +928,7 @@ bool osc_over_unstable_soft_limit(struct client_obd *cli)
 
 	CDEBUG(D_CACHE,
 	       "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n",
-	       cli->cl_import->imp_obd->obd_name, cli,
-	       unstable_nr, osc_unstable_count);
+	       cli_name(cli), cli, unstable_nr, osc_unstable_count);
 
 	/*
 	 * If the LRU slots are in shortage - 25% remaining AND this OSC