From 3b8cafdd5436f9298b3bf6eb831df5eef5ee82b6 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 16 Jul 2019 14:39:34 +0900
Subject: dm zoned: fix zone state management race

dm-zoned uses the zone flag DMZ_ACTIVE to indicate that a zone of the
backend device is being actively read or written and so cannot be
reclaimed. This flag is set as long as the zone atomic reference
counter is not 0. When this atomic is decremented and reaches 0 (e.g.
on BIO completion), the active flag is cleared and set again whenever
the zone is reused and BIO issued with the atomic counter incremented.
These 2 operations (atomic inc/dec and flag set/clear) are however not
always executed atomically under the target metadata mutex lock and
this causes the warning:

WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));

in dmz_deactivate_zone() to be displayed. This problem is regularly
triggered with xfstests generic/209, generic/300, generic/451 and
xfs/077 with XFS being used as the file system on the dm-zoned target
device. Similarly, xfstests ext4/303, ext4/304, generic/209 and
generic/300 trigger the warning with ext4 use.

This problem can be easily fixed by simply removing the DMZ_ACTIVE flag
and managing the "ACTIVE" state by directly looking at the reference
counter value. To do so, the functions dmz_activate_zone() and
dmz_deactivate_zone() are changed to inline functions respectively
calling atomic_inc() and atomic_dec(), while the dmz_is_active() macro
is changed to an inline function calling atomic_read().

Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
Cc: stable@vger.kernel.org
Reported-by: Masato Suzuki <masato.suzuki@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-zoned-metadata.c | 24 ------------------------
 drivers/md/dm-zoned.h          | 28 ++++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 28 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index d8334cd45d7c..4cdde7a02e94 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1593,30 +1593,6 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
 	return zone;
 }
 
-/*
- * Activate a zone (increment its reference count).
- */
-void dmz_activate_zone(struct dm_zone *zone)
-{
-	set_bit(DMZ_ACTIVE, &zone->flags);
-	atomic_inc(&zone->refcount);
-}
-
-/*
- * Deactivate a zone. This decrement the zone reference counter
- * and clears the active state of the zone once the count reaches 0,
- * indicating that all BIOs to the zone have completed. Returns
- * true if the zone was deactivated.
- */
-void dmz_deactivate_zone(struct dm_zone *zone)
-{
-	if (atomic_dec_and_test(&zone->refcount)) {
-		WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
-		clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
-		smp_mb__after_atomic();
-	}
-}
-
 /*
  * Get the zone mapping a chunk, if the chunk is mapped already.
  * If no mapping exist and the operation is WRITE, a zone is
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 12419f0bfe78..ed8de49c9a08 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -115,7 +115,6 @@ enum {
 	DMZ_BUF,
 
 	/* Zone internal state */
-	DMZ_ACTIVE,
 	DMZ_RECLAIM,
 	DMZ_SEQ_WRITE_ERR,
 };
@@ -128,7 +127,6 @@ enum {
 #define dmz_is_empty(z)		((z)->wp_block == 0)
 #define dmz_is_offline(z)	test_bit(DMZ_OFFLINE, &(z)->flags)
 #define dmz_is_readonly(z)	test_bit(DMZ_READ_ONLY, &(z)->flags)
-#define dmz_is_active(z)	test_bit(DMZ_ACTIVE, &(z)->flags)
 #define dmz_in_reclaim(z)	test_bit(DMZ_RECLAIM, &(z)->flags)
 #define dmz_seq_write_err(z)	test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
 
@@ -188,8 +186,30 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
 unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
 unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
 
-void dmz_activate_zone(struct dm_zone *zone);
-void dmz_deactivate_zone(struct dm_zone *zone);
+/*
+ * Activate a zone (increment its reference count).
+ */
+static inline void dmz_activate_zone(struct dm_zone *zone)
+{
+	atomic_inc(&zone->refcount);
+}
+
+/*
+ * Deactivate a zone. This decrement the zone reference counter
+ * indicating that all BIOs to the zone have completed when the count is 0.
+ */
+static inline void dmz_deactivate_zone(struct dm_zone *zone)
+{
+	atomic_dec(&zone->refcount);
+}
+
+/*
+ * Test if a zone is active, that is, has a refcount > 0.
+ */
+static inline bool dmz_is_active(struct dm_zone *zone)
+{
+	return atomic_read(&zone->refcount);
+}
 
 int dmz_lock_zone_reclaim(struct dm_zone *zone);
 void dmz_unlock_zone_reclaim(struct dm_zone *zone);
-- 
cgit v1.2.3-59-g8ed1b


From 3ee25485ba8e8271fe9401eef5003c20ab648ddf Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 17 Jul 2019 11:12:30 -0400
Subject: dm snapshot: fix oversights in optional discard support

__find_snapshots_sharing_cow() should always be used with _origins_lock
held so fix snapshot_io_hints() accordingly.  Also, once a snapshot is
being merged discards must not be allowed -- otherwise incorrect or
duplicate work will be performed.

Fixes: 2e6023850e177d ("dm snapshot: add optional discard support features")
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-snap.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 63916e1dc569..f150f5c5492b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2072,6 +2072,12 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_REMAPPED;
 	}
 
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+		/* Once merging, discards no longer effect change */
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
 	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
 
 	down_write(&s->lock);
@@ -2331,6 +2337,8 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	if (snap->discard_zeroes_cow) {
 		struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 
+		down_read(&_origins_lock);
+
 		(void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
 		if (snap_src && snap_dest)
 			snap = snap_src;
@@ -2338,6 +2346,8 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
 		/* All discards are split on chunk_size boundary */
 		limits->discard_granularity = snap->store->chunk_size;
 		limits->max_discard_sectors = snap->store->chunk_size;
+
+		up_read(&_origins_lock);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c663e04097f4e286fc146f79eb5ef6a47c01d337 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Wed, 17 Jul 2019 14:24:10 +0300
Subject: dm kcopyd: Increase default sub-job size to 512KB

Currently, kcopyd has a sub-job size of 64KB and a maximum number of 8
sub-jobs. As a result, for any kcopyd job, we have a maximum of 512KB of
I/O in flight.

This upper limit to the amount of in-flight I/O under-utilizes fast
devices and results in decreased throughput, e.g., when writing to a
snapshotted thin LV with I/O size less than the pool's block size (so
COW is performed using kcopyd).

Increase kcopyd's default sub-job size to 512KB, so we have a maximum of
4MB of I/O in flight for each kcopyd job. This results in an up to 96%
improvement of bandwidth when writing to a snapshotted thin LV, with I/O
sizes less than the pool's block size.

Also, add dm_mod.kcopyd_subjob_size_kb module parameter to allow users
to fine tune the sub-job size of kcopyd. The default value of this
parameter is 512KB and the maximum allowed value is 1024KB.

We evaluate the performance impact of the change by running the
snap_breaking_throughput benchmark, from the device mapper test suite
[1].

The benchmark:

  1. Creates a 1G thin LV
  2. Provisions the thin LV
  3. Takes a snapshot of the thin LV
  4. Writes to the thin LV with:

      dd if=/dev/zero of=/dev/vg/thin_lv oflag=direct bs=<I/O size>

Running this benchmark with various thin pool block sizes and dd I/O
sizes (all combinations triggering the use of kcopyd) we get the
following results:

+-----------------+-------------+------------------+-----------------+
| Pool block size | dd I/O size | BW before (MB/s) | BW after (MB/s) |
+-----------------+-------------+------------------+-----------------+
|       1 MB      |      256 KB |       242        |       280       |
|       1 MB      |      512 KB |       238        |       295       |
|                 |             |                  |                 |
|       2 MB      |      256 KB |       238        |       354       |
|       2 MB      |      512 KB |       241        |       380       |
|       2 MB      |        1 MB |       245        |       394       |
|                 |             |                  |                 |
|       4 MB      |      256 KB |       248        |       412       |
|       4 MB      |      512 KB |       234        |       432       |
|       4 MB      |        1 MB |       251        |       474       |
|       4 MB      |        2 MB |       257        |       504       |
|                 |             |                  |                 |
|       8 MB      |      256 KB |       239        |       420       |
|       8 MB      |      512 KB |       256        |       431       |
|       8 MB      |        1 MB |       264        |       467       |
|       8 MB      |        2 MB |       264        |       502       |
|       8 MB      |        4 MB |       281        |       537       |
+-----------------+-------------+------------------+-----------------+

[1] https://github.com/jthornber/device-mapper-test-suite

Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-kcopyd.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 671c24332802..df2011de7be2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -28,10 +28,27 @@
 
 #include "dm-core.h"
 
-#define SUB_JOB_SIZE	128
 #define SPLIT_COUNT	8
 #define MIN_JOBS	8
-#define RESERVE_PAGES	(DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE))
+
+#define DEFAULT_SUB_JOB_SIZE_KB 512
+#define MAX_SUB_JOB_SIZE_KB     1024
+
+static unsigned kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB;
+
+module_param(kcopyd_subjob_size_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients");
+
+static unsigned dm_get_kcopyd_subjob_size(void)
+{
+	unsigned sub_job_size_kb;
+
+	sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb,
+						DEFAULT_SUB_JOB_SIZE_KB,
+						MAX_SUB_JOB_SIZE_KB);
+
+	return sub_job_size_kb << 1;
+}
 
 /*-----------------------------------------------------------------
  * Each kcopyd client has its own little pool of preallocated
@@ -41,6 +58,7 @@ struct dm_kcopyd_client {
 	struct page_list *pages;
 	unsigned nr_reserved_pages;
 	unsigned nr_free_pages;
+	unsigned sub_job_size;
 
 	struct dm_io_client *io_client;
 
@@ -693,8 +711,8 @@ static void segment_complete(int read_err, unsigned long write_err,
 		progress = job->progress;
 		count = job->source.count - progress;
 		if (count) {
-			if (count > SUB_JOB_SIZE)
-				count = SUB_JOB_SIZE;
+			if (count > kc->sub_job_size)
+				count = kc->sub_job_size;
 
 			job->progress += count;
 		}
@@ -821,7 +839,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->master_job = job;
 	job->write_offset = 0;
 
-	if (job->source.count <= SUB_JOB_SIZE)
+	if (job->source.count <= kc->sub_job_size)
 		dispatch_job(job);
 	else {
 		job->progress = 0;
@@ -888,6 +906,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
 {
 	int r;
+	unsigned reserve_pages;
 	struct dm_kcopyd_client *kc;
 
 	kc = kzalloc(sizeof(*kc), GFP_KERNEL);
@@ -912,9 +931,12 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
 		goto bad_workqueue;
 	}
 
+	kc->sub_job_size = dm_get_kcopyd_subjob_size();
+	reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE);
+
 	kc->pages = NULL;
 	kc->nr_reserved_pages = kc->nr_free_pages = 0;
-	r = client_reserve_pages(kc, RESERVE_PAGES);
+	r = client_reserve_pages(kc, reserve_pages);
 	if (r)
 		goto bad_client_pages;
 
-- 
cgit v1.2.3-59-g8ed1b