From 0dd69643061d78f3f9047c2382d8d77cca1ac943 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 27 Jun 2017 19:56:33 +1000
Subject: libnvdimm: Stop using HPAGE_SIZE

Currently libnvdimm uses HPAGE_SIZE as the default alignment for DAX and
PFN devices. HPAGE_SIZE is the default hugetlbfs page size and when
hugetlbfs is disabled it defaults to PAGE_SIZE. Given DAX has more
in common with THP than hugetlbfs we should proably be using
HPAGE_PMD_SIZE, but this is undefined when THP is disabled so lets just
give it a new name.

The other usage of HPAGE_SIZE in libnvdimm is when determining how large
the altmap should be. For the reasons mentioned above it doesn't really
make sense to use HPAGE_SIZE here either. PMD_SIZE seems to be safe to
use in generic code and it happens to match the vmemmap allocation block
on x86 and Power. It's still a hack, but it's a slightly nicer hack.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/nd.h       | 7 +++++++
 drivers/nvdimm/pfn_devs.c | 9 +++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index e1b5715bd91f..e9fa9e84b364 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -285,6 +285,13 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 
 struct nd_pfn *to_nd_pfn(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_PFN)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
+#else
+#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
+#endif
+
 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 5fcb6f5b22a2..2ae9a000b090 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -290,7 +290,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
 		return NULL;
 
 	nd_pfn->mode = PFN_MODE_NONE;
-	nd_pfn->align = HPAGE_SIZE;
+	nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
 	dev = &nd_pfn->dev;
 	device_initialize(&nd_pfn->dev);
 	if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
@@ -638,11 +638,12 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 			/ PAGE_SIZE);
 	if (nd_pfn->mode == PFN_MODE_PMEM) {
 		/*
-		 * vmemmap_populate_hugepages() allocates the memmap array in
-		 * HPAGE_SIZE chunks.
+		 * The altmap should be padded out to the block size used
+		 * when populating the vmemmap. This *should* be equal to
+		 * PMD_SIZE for most architectures.
 		 */
 		offset = ALIGN(start + SZ_8K + 64 * npfns + dax_label_reserve,
-				max(nd_pfn->align, HPAGE_SIZE)) - start;
+				max(nd_pfn->align, PMD_SIZE)) - start;
 	} else if (nd_pfn->mode == PFN_MODE_RAM)
 		offset = ALIGN(start + SZ_8K + dax_label_reserve,
 				nd_pfn->align) - start;
-- 
cgit v1.2.3-59-g8ed1b


From 401c0a19c6c22efcaff85d5a64a396f9130da2ca Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 4 Aug 2017 17:20:16 -0700
Subject: nfit, libnvdimm, region: export 'position' in mapping info

It is useful to be able to know the position of a DIMM in an
interleave-set. Consider the case where the order of the DIMMs changes
causing a namespace to be invalidated because the interleave-set cookie no
longer matches. If the before and after state of each DIMM position is
known this state debugged by the system owner.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c     | 24 ++++++++++++++++++++++++
 drivers/nvdimm/nd.h          |  1 +
 drivers/nvdimm/region_devs.c |  6 ++++--
 include/linux/libnvdimm.h    |  1 +
 4 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 19182d091587..be231a549eb0 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1835,6 +1835,30 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 			cmp_map_compat, NULL);
 	nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
 
+	/* record the result of the sort for the mapping position */
+	for (i = 0; i < nr; i++) {
+		struct nfit_set_info_map2 *map2 = &info2->mapping[i];
+		int j;
+
+		for (j = 0; j < nr; j++) {
+			struct nd_mapping_desc *mapping = &ndr_desc->mapping[j];
+			struct nvdimm *nvdimm = mapping->nvdimm;
+			struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
+			if (map2->serial_number
+				== nfit_mem->dcr->serial_number &&
+			    map2->vendor_id
+				== nfit_mem->dcr->vendor_id &&
+			    map2->manufacturing_date
+				== nfit_mem->dcr->manufacturing_date &&
+			    map2->manufacturing_location
+				== nfit_mem->dcr->manufacturing_location) {
+				mapping->position = i;
+				break;
+			}
+		}
+	}
+
 	ndr_desc->nd_set = nd_set;
 	devm_kfree(dev, info);
 	devm_kfree(dev, info2);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index e9fa9e84b364..a08fc2e24fb3 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -134,6 +134,7 @@ struct nd_mapping {
 	struct nvdimm *nvdimm;
 	u64 start;
 	u64 size;
+	int position;
 	struct list_head labels;
 	struct mutex lock;
 	/*
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 5954cfbea3fc..829d760f651c 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -723,8 +723,9 @@ static ssize_t mappingN(struct device *dev, char *buf, int n)
 	nd_mapping = &nd_region->mapping[n];
 	nvdimm = nd_mapping->nvdimm;
 
-	return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev),
-			nd_mapping->start, nd_mapping->size);
+	return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(&nvdimm->dev),
+			nd_mapping->start, nd_mapping->size,
+			nd_mapping->position);
 }
 
 #define REGION_MAPPING(idx) \
@@ -965,6 +966,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 		nd_region->mapping[i].nvdimm = nvdimm;
 		nd_region->mapping[i].start = mapping->start;
 		nd_region->mapping[i].size = mapping->size;
+		nd_region->mapping[i].position = mapping->position;
 		INIT_LIST_HEAD(&nd_region->mapping[i].labels);
 		mutex_init(&nd_region->mapping[i].lock);
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index f3d3e6af8838..9b8d81a7b80e 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -87,6 +87,7 @@ struct nd_mapping_desc {
 	struct nvdimm *nvdimm;
 	u64 start;
 	u64 size;
+	int position;
 };
 
 struct nd_region_desc {
-- 
cgit v1.2.3-59-g8ed1b


From dcb79b15449fc3e4427a3d1cbcc72661ba13622d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 7 Aug 2017 14:27:57 -0700
Subject: nfit: cleanup long de-reference chains in
 acpi_nfit_init_interleave_set

Use a local 'struct acpi_nfit_control_region *' variable to shorten the
pointer chasing chains.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index be231a549eb0..2c5608b92578 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1804,6 +1804,7 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 		struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc,
 				spa->range_index, i);
+		struct acpi_nfit_control_region *dcr = nfit_mem->dcr;
 
 		if (!memdev || !nfit_mem->dcr) {
 			dev_err(dev, "%s: failed to find DCR\n", __func__);
@@ -1811,13 +1812,13 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 		}
 
 		map->region_offset = memdev->region_offset;
-		map->serial_number = nfit_mem->dcr->serial_number;
+		map->serial_number = dcr->serial_number;
 
 		map2->region_offset = memdev->region_offset;
-		map2->serial_number = nfit_mem->dcr->serial_number;
-		map2->vendor_id = nfit_mem->dcr->vendor_id;
-		map2->manufacturing_date = nfit_mem->dcr->manufacturing_date;
-		map2->manufacturing_location = nfit_mem->dcr->manufacturing_location;
+		map2->serial_number = dcr->serial_number;
+		map2->vendor_id = dcr->vendor_id;
+		map2->manufacturing_date = dcr->manufacturing_date;
+		map2->manufacturing_location = dcr->manufacturing_location;
 	}
 
 	/* v1.1 namespaces */
@@ -1844,15 +1845,13 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 			struct nd_mapping_desc *mapping = &ndr_desc->mapping[j];
 			struct nvdimm *nvdimm = mapping->nvdimm;
 			struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+			struct acpi_nfit_control_region *dcr = nfit_mem->dcr;
 
-			if (map2->serial_number
-				== nfit_mem->dcr->serial_number &&
-			    map2->vendor_id
-				== nfit_mem->dcr->vendor_id &&
-			    map2->manufacturing_date
-				== nfit_mem->dcr->manufacturing_date &&
+			if (map2->serial_number == dcr->serial_number &&
+			    map2->vendor_id == dcr->vendor_id &&
+			    map2->manufacturing_date == dcr->manufacturing_date &&
 			    map2->manufacturing_location
-				== nfit_mem->dcr->manufacturing_location) {
+				    == dcr->manufacturing_location) {
 				mapping->position = i;
 				break;
 			}
-- 
cgit v1.2.3-59-g8ed1b


From b2c48f9f95cba395e16020bef1fdfc248f53030c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 11 Aug 2017 17:36:54 -0700
Subject: libnvdimm: rename nd_sector_size_{show,store} to
 nd_size_select_{show,store}

Prepare for other another consumer of this size selection scheme that is
not a 'sector size'.

Cc: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt_devs.c       |  4 ++--
 drivers/nvdimm/core.c           | 10 +++++-----
 drivers/nvdimm/namespace_devs.c |  6 +++---
 drivers/nvdimm/nd.h             |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 3e359d282f8e..d58925295aa7 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -61,7 +61,7 @@ static ssize_t sector_size_show(struct device *dev,
 {
 	struct nd_btt *nd_btt = to_nd_btt(dev);
 
-	return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf);
+	return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf);
 }
 
 static ssize_t sector_size_store(struct device *dev,
@@ -72,7 +72,7 @@ static ssize_t sector_size_store(struct device *dev,
 
 	device_lock(dev);
 	nvdimm_bus_lock(dev);
-	rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize,
+	rc = nd_size_select_store(dev, buf, &nd_btt->lbasize,
 			btt_lbasize_supported);
 	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
 			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 75bc08c6838c..bb71f0cf8f5d 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -277,14 +277,14 @@ int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
 	return 0;
 }
 
-ssize_t nd_sector_size_show(unsigned long current_lbasize,
+ssize_t nd_size_select_show(unsigned long current_size,
 		const unsigned long *supported, char *buf)
 {
 	ssize_t len = 0;
 	int i;
 
 	for (i = 0; supported[i]; i++)
-		if (current_lbasize == supported[i])
+		if (current_size == supported[i])
 			len += sprintf(buf + len, "[%ld] ", supported[i]);
 		else
 			len += sprintf(buf + len, "%ld ", supported[i]);
@@ -292,8 +292,8 @@ ssize_t nd_sector_size_show(unsigned long current_lbasize,
 	return len;
 }
 
-ssize_t nd_sector_size_store(struct device *dev, const char *buf,
-		unsigned long *current_lbasize, const unsigned long *supported)
+ssize_t nd_size_select_store(struct device *dev, const char *buf,
+		unsigned long *current_size, const unsigned long *supported)
 {
 	unsigned long lbasize;
 	int rc, i;
@@ -310,7 +310,7 @@ ssize_t nd_sector_size_store(struct device *dev, const char *buf,
 			break;
 
 	if (supported[i]) {
-		*current_lbasize = lbasize;
+		*current_size = lbasize;
 		return 0;
 	} else {
 		return -EINVAL;
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 5f1c6756e57c..1427a386a033 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1313,14 +1313,14 @@ static ssize_t sector_size_show(struct device *dev,
 	if (is_namespace_blk(dev)) {
 		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
 
-		return nd_sector_size_show(nsblk->lbasize,
+		return nd_size_select_show(nsblk->lbasize,
 				blk_lbasize_supported, buf);
 	}
 
 	if (is_namespace_pmem(dev)) {
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
-		return nd_sector_size_show(nspm->lbasize,
+		return nd_size_select_show(nspm->lbasize,
 				pmem_lbasize_supported, buf);
 	}
 	return -ENXIO;
@@ -1352,7 +1352,7 @@ static ssize_t sector_size_store(struct device *dev,
 	if (to_ndns(dev)->claim)
 		rc = -EBUSY;
 	if (rc >= 0)
-		rc = nd_sector_size_store(dev, buf, lbasize, supported);
+		rc = nd_size_select_store(dev, buf, lbasize, supported);
 	if (rc >= 0)
 		rc = nd_namespace_label_update(nd_region, dev);
 	dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__,
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index a08fc2e24fb3..251c7e6d2588 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -234,10 +234,10 @@ void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
 void nd_device_notify(struct device *dev, enum nvdimm_event event);
 int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
 		size_t len);
-ssize_t nd_sector_size_show(unsigned long current_lbasize,
+ssize_t nd_size_select_show(unsigned long current_size,
 		const unsigned long *supported, char *buf);
-ssize_t nd_sector_size_store(struct device *dev, const char *buf,
-		unsigned long *current_lbasize, const unsigned long *supported);
+ssize_t nd_size_select_store(struct device *dev, const char *buf,
+		unsigned long *current_size, const unsigned long *supported);
 int __init nvdimm_init(void);
 int __init nd_region_init(void);
 int __init nd_label_init(void);
-- 
cgit v1.2.3-59-g8ed1b


From 1fdadbebc4f617c1ee4a1465ad173cc9e524089d Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 27 Jun 2017 19:56:12 +1000
Subject: libnvdimm, pfn, dax: show supported dax/pfn region alignments in
 sysfs

The alignment of a DAX and PFN regions dictates the page sizes that can
be used to map the region. Even if the hardware page sizes are known the
actual range of supported page sizes that can be used with DAX depends
on the kernel configuration. As a result it's best that the kernel
advertises the alignments that should be used with these region types.

This patch adds the 'supported_alignments' region attribute to expose
this information to userspace.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
[djbw: integrate with nd_size_select_show() rename and other fixups]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pfn_devs.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'drivers')

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 2ae9a000b090..c500531ca23b 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -111,6 +111,29 @@ static ssize_t align_show(struct device *dev,
 	return sprintf(buf, "%ld\n", nd_pfn->align);
 }
 
+static const unsigned long *nd_pfn_supported_alignments(void)
+{
+	/*
+	 * This needs to be a non-static variable because the *_SIZE
+	 * macros aren't always constants.
+	 */
+	const unsigned long supported_alignments[] = {
+		PAGE_SIZE,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		HPAGE_PMD_SIZE,
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+		HPAGE_PUD_SIZE,
+#endif
+#endif
+		0,
+	};
+	static unsigned long data[ARRAY_SIZE(supported_alignments)];
+
+	memcpy(data, supported_alignments, sizeof(data));
+
+	return data;
+}
+
 static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
 {
 	unsigned long val;
@@ -260,6 +283,13 @@ static ssize_t size_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(size);
 
+static ssize_t supported_alignments_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return nd_size_select_show(0, nd_pfn_supported_alignments(), buf);
+}
+static DEVICE_ATTR_RO(supported_alignments);
+
 static struct attribute *nd_pfn_attributes[] = {
 	&dev_attr_mode.attr,
 	&dev_attr_namespace.attr,
@@ -267,6 +297,7 @@ static struct attribute *nd_pfn_attributes[] = {
 	&dev_attr_align.attr,
 	&dev_attr_resource.attr,
 	&dev_attr_size.attr,
+	&dev_attr_supported_alignments.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From f13d2b61e59cbdd813be7639eb85bfbf99593ac0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 11 Aug 2017 17:54:48 -0700
Subject: libnvdimm, pfn, dax: limit namespace alignments to the supported set

Now that we properly advertise the supported pte, pmd, and pud sizes,
restrict the supported alignments that can be set on a namespace. This
assumes that userspace was not previously relying on the ability to set
odd alignments. At least ndctl only ever supported setting the namespace
alignment to 4K, 2M, or 1G.

Cc: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pfn_devs.c | 23 ++---------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index c500531ca23b..9576c444f0ab 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -134,26 +134,6 @@ static const unsigned long *nd_pfn_supported_alignments(void)
 	return data;
 }
 
-static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
-{
-	unsigned long val;
-	int rc;
-
-	rc = kstrtoul(buf, 0, &val);
-	if (rc)
-		return rc;
-
-	if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G)
-		return -EINVAL;
-
-	if (nd_pfn->dev.driver)
-		return -EBUSY;
-	else
-		nd_pfn->align = val;
-
-	return 0;
-}
-
 static ssize_t align_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
@@ -162,7 +142,8 @@ static ssize_t align_store(struct device *dev,
 
 	device_lock(dev);
 	nvdimm_bus_lock(dev);
-	rc = __align_store(nd_pfn, buf);
+	rc = nd_size_select_store(dev, buf, &nd_pfn->align,
+			nd_pfn_supported_alignments());
 	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
 			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
-- 
cgit v1.2.3-59-g8ed1b


From 02881768695da29772f6f9e0d857a8637c6b0e90 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 29 Aug 2017 18:28:18 -0700
Subject: libnvdimm, label: fix index block size calculation

The old calculation assumed that the label space was 128k and the label
size is 128. With v1.2 labels where the label size is 256 this
calculation will return zero. We are saved by the fact that the
nsindex_size is always pre-initialized from a previous 128 byte
assumption and we are lucky that the index sizes turn out the same.

Fix this going forward in case we start encountering different
geometries of label areas besides 128k.

Since the label size can change from one call to the next, drop the
caching of nsindex_size.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/label.c | 30 ++++++++++++++++--------------
 drivers/nvdimm/nd.h    |  2 +-
 2 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 87796f840777..9c5f108910e3 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -45,12 +45,14 @@ unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd)
 	return ndd->nslabel_size;
 }
 
-size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
+int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
 {
-	u32 index_span;
+	return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
+}
 
-	if (ndd->nsindex_size)
-		return ndd->nsindex_size;
+size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
+{
+	u32 nslot, space, size;
 
 	/*
 	 * The minimum index space is 512 bytes, with that amount of
@@ -60,16 +62,16 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
 	 * starts to waste space at larger config_sizes, but it's
 	 * unlikely we'll ever see anything but 128K.
 	 */
-	index_span = ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
-	index_span /= NSINDEX_ALIGN * 2;
-	ndd->nsindex_size = index_span * NSINDEX_ALIGN;
-
-	return ndd->nsindex_size;
-}
-
-int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
-{
-	return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
+	nslot = nvdimm_num_label_slots(ndd);
+	space = ndd->nsarea.config_size - nslot * sizeof_namespace_label(ndd);
+	size = ALIGN(sizeof(struct nd_namespace_index) + DIV_ROUND_UP(nslot, 8),
+			NSINDEX_ALIGN) * 2;
+	if (size <= space)
+		return size / 2;
+
+	dev_err(ndd->dev, "label area (%d) too small to host (%d byte) labels\n",
+			ndd->nsarea.config_size, sizeof_namespace_label(ndd));
+	return 0;
 }
 
 static int __nd_label_validate(struct nvdimm_drvdata *ndd)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 251c7e6d2588..023fc93e21a5 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -42,7 +42,7 @@ struct nd_poison {
 
 struct nvdimm_drvdata {
 	struct device *dev;
-	int nsindex_size, nslabel_size;
+	int nslabel_size;
 	struct nd_cmd_get_config_size nsarea;
 	void *data;
 	int ns_current, ns_next;
-- 
cgit v1.2.3-59-g8ed1b


From ed36b4dba54a421ce5551638f6a9790b2c2116b1 Mon Sep 17 00:00:00 2001
From: Christophe Jaillet <christophe.jaillet@wanadoo.fr>
Date: Sun, 27 Aug 2017 08:30:34 +0200
Subject: libnvdimm, btt: check memory allocation failure

Check memory allocation failures and return -ENOMEM in such cases, as
already done few lines below for another memory allocation.

This avoids NULL pointers dereference.

Cc: <stable@vger.kernel.org>
Fixes: 14e494542636 ("libnvdimm, btt: BTT updates for UEFI 2.7 format")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 14323faf8bd9..7ec6393b6ba1 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1429,6 +1429,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
 	}
 
 	btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
+	if (!btt_sb)
+		return -ENOMEM;
 
 	/*
 	 * If this returns < 0, that is ok as it just means there wasn't
-- 
cgit v1.2.3-59-g8ed1b


From 78f35473508118df5ea04b9515ac3f1aaec0a980 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 30 Aug 2017 09:16:38 -0700
Subject: dax: introduce a fs_dax_get_by_bdev() helper

Add a helper that can replace the following common pattern:

	if (blk_queue_dax(bdev->bd_queue))
		fs_dax_get_by_host(bdev->bd_disk->disk_name);

This will be used to move dax_device lookup from iomap-operation time to
fs-mount time.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c | 10 ++++++++++
 include/linux/dax.h |  6 ++++++
 2 files changed, 16 insertions(+)

(limited to 'drivers')

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 938eb4868f7f..b699aac268a6 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -46,6 +46,8 @@ void dax_read_unlock(int id)
 EXPORT_SYMBOL_GPL(dax_read_unlock);
 
 #ifdef CONFIG_BLOCK
+#include <linux/blkdev.h>
+
 int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
 		pgoff_t *pgoff)
 {
@@ -59,6 +61,14 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
 }
 EXPORT_SYMBOL(bdev_dax_pgoff);
 
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+{
+	if (!blk_queue_dax(bdev->bd_queue))
+		return NULL;
+	return fs_dax_get_by_host(bdev->bd_disk->disk_name);
+}
+EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+
 /**
  * __bdev_dax_supported() - Check if the device supports dax for filesystem
  * @sb: The superblock of the device
diff --git a/include/linux/dax.h b/include/linux/dax.h
index df97b7af7e2c..ac8afa18f707 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -57,6 +57,7 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 	put_dax(dax_dev);
 }
 
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
 #else
 static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
 {
@@ -71,6 +72,11 @@ static inline struct dax_device *fs_dax_get_by_host(const char *host)
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
 }
+
+static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+{
+	return NULL;
+}
 #endif
 
 int dax_read_lock(void);
-- 
cgit v1.2.3-59-g8ed1b


From a15797f4bef201544263ef5c264c3f48d78cc5d4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 31 Aug 2017 12:53:36 -0700
Subject: libnvdimm, nfit: export an 'ecc_unit_size' sysfs attribute

When the nfit driver initializes it runs an ARS (Address Range Scrub)
operation across every pmem range. Part of that process involves
determining the ARS capabilities of a given address range. One of the
capabilities that is reported is the 'Clear Uncorrectable Error Range
Length Unit Size' (see: ACPI 6.2 section 9.20.7.4 Function Index 1 -
Query ARS Capabilities). This property is of interest to userspace
software as it indicates the boundary at which the NVDIMM may need to
perform read-modify-write cycles to maintain ECC blocks.

Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'drivers')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 2c5608b92578..03105648f9b1 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1674,8 +1674,19 @@ static ssize_t range_index_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(range_index);
 
+static ssize_t ecc_unit_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
+
+	return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
+}
+static DEVICE_ATTR_RO(ecc_unit_size);
+
 static struct attribute *acpi_nfit_region_attributes[] = {
 	&dev_attr_range_index.attr,
+	&dev_attr_ecc_unit_size.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 1db1f3cea1d8886c686832d4618b346ae16c03c8 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 30 Aug 2017 19:35:58 -0600
Subject: libnvdimm, btt: fix a missed NVDIMM_IO_ATOMIC case in the write path

The IO context conversion for rw_bytes missed a case in the BTT write
path (btt_map_write) which should've been marked as atomic.

In reality this should not cause a problem, because map writes are to
small for nsio_rw_bytes to attempt error clearing, but it should be
fixed for posterity.

Add a might_sleep() in the non-atomic section of nsio_rw_bytes so that
things like the nfit unit tests, which don't actually sleep, can catch
bugs like this.

Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c   | 3 ++-
 drivers/nvdimm/claim.c | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 7ec6393b6ba1..a5e4134e1ed0 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1156,7 +1156,8 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 		if (ret)
 			goto out_map;
 
-		ret = btt_map_write(arena, premap, new_postmap, 0, 0, 0);
+		ret = btt_map_write(arena, premap, new_postmap, 0, 0,
+			NVDIMM_IO_ATOMIC);
 		if (ret)
 			goto out_map;
 
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 47770460f3d3..3e6404f1ba5a 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -292,6 +292,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 				&& !(flags & NVDIMM_IO_ATOMIC)) {
 			long cleared;
 
+			might_sleep();
 			cleared = nvdimm_clear_poison(&ndns->dev,
 					nsio->res.start + offset, size);
 			if (cleared < size)
-- 
cgit v1.2.3-59-g8ed1b


From 0595d539a5deb4f495618ebbed96db59ae635e32 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 30 Aug 2017 19:35:59 -0600
Subject: libnvdimm, btt: refactor map entry operations with macros

Add helpers for converting a raw map entry to just the block number, or
either of the 'e' or 'z' flags in preparation for actually using the
error flag to mark blocks with media errors.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c | 8 ++++----
 drivers/nvdimm/btt.h | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index a5e4134e1ed0..bb816bc1a906 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -106,7 +106,7 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
 	 * This 'mapping' is supposed to be just the LBA mapping, without
 	 * any flags set, so strip the flag bits.
 	 */
-	mapping &= MAP_LBA_MASK;
+	mapping = ent_lba(mapping);
 
 	ze = (z_flag << 1) + e_flag;
 	switch (ze) {
@@ -155,10 +155,10 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 
 	raw_mapping = le32_to_cpu(in);
 
-	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
-	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
+	z_flag = ent_z_flag(raw_mapping);
+	e_flag = ent_e_flag(raw_mapping);
 	ze = (z_flag << 1) + e_flag;
-	postmap = raw_mapping & MAP_LBA_MASK;
+	postmap = ent_lba(raw_mapping);
 
 	/* Reuse the {z,e}_flag variables for *trim and *error */
 	z_flag = 0;
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 888e862907a0..09fabf5a5590 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -38,6 +38,10 @@
 #define IB_FLAG_ERROR 0x00000001
 #define IB_FLAG_ERROR_MASK 0x00000001
 
+#define ent_lba(ent) (ent & MAP_LBA_MASK)
+#define ent_e_flag(ent) (!!(ent & MAP_ERR_MASK))
+#define ent_z_flag(ent) (!!(ent & MAP_TRIM_MASK))
+
 enum btt_init_state {
 	INIT_UNCHECKED = 0,
 	INIT_NOTFOUND,
-- 
cgit v1.2.3-59-g8ed1b


From 1398199d849047c59acef0c04e550b47bb9b4be6 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 30 Aug 2017 19:36:00 -0600
Subject: libnvdimm, btt: ensure that flags were also unchanged during a
 map_read

In btt_map_read, we read the map twice to make sure that the map entry
didn't change after we added it to the read tracking table. In
anticipation of expanding the use of the error bit, also make sure that
the error and zero flags are constant across the two map reads.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index bb816bc1a906..15d1b071746b 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1032,6 +1032,7 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
 		 */
 		while (1) {
 			u32 new_map;
+			int new_t, new_e;
 
 			if (t_flag) {
 				zero_fill_data(page, off, cur_len);
@@ -1050,15 +1051,18 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
 			 */
 			barrier();
 
-			ret = btt_map_read(arena, premap, &new_map, &t_flag,
-						&e_flag, NVDIMM_IO_ATOMIC);
+			ret = btt_map_read(arena, premap, &new_map, &new_t,
+						&new_e, NVDIMM_IO_ATOMIC);
 			if (ret)
 				goto out_rtt;
 
-			if (postmap == new_map)
+			if ((postmap == new_map) && (t_flag == new_t) &&
+					(e_flag == new_e))
 				break;
 
 			postmap = new_map;
+			t_flag = new_t;
+			e_flag = new_e;
 		}
 
 		ret = btt_data_read(arena, page, off, postmap, cur_len);
-- 
cgit v1.2.3-59-g8ed1b


From 75892004508260df72ed3d319f10d2acd516220e Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 30 Aug 2017 19:36:01 -0600
Subject: libnvdimm, btt: cache sector_size in arena_info

In preparation for the error clearing rework, add sector_size in the
arena_info struct.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c | 1 +
 drivers/nvdimm/btt.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 15d1b071746b..9c96530ea6d5 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -566,6 +566,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	if (!arena)
 		return NULL;
 	arena->nd_btt = btt->nd_btt;
+	arena->sector_size = btt->sector_size;
 
 	if (!size)
 		return arena;
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 09fabf5a5590..2bc0d10b8438 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -108,6 +108,7 @@ struct aligned_lock {
  *			handle incoming writes.
  * @version_major:	Metadata layout version major.
  * @version_minor:	Metadata layout version minor.
+ * @sector_size:	The Linux sector size - 512 or 4096
  * @nextoff:		Offset in bytes to the start of the next arena.
  * @infooff:		Offset in bytes to the info block of this arena.
  * @dataoff:		Offset in bytes to the data area of this arena.
@@ -135,6 +136,7 @@ struct arena_info {
 	u32 nfree;
 	u16 version_major;
 	u16 version_minor;
+	u32 sector_size;
 	/* Byte offsets to the different on-media structures */
 	u64 nextoff;
 	u64 infooff;
-- 
cgit v1.2.3-59-g8ed1b


From 0930a750c35be3c2f5aacebc0d20ddeaf727c208 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 30 Aug 2017 19:36:02 -0600
Subject: libnvdimm: fix potential deadlock while clearing errors

With the ACPI NFIT 'DSM' methods, acpi can be called from IO paths.
Specifically, the DSM to clear media errors is called during writes, so
that we can provide a writes-fix-errors model.

However it is easy to imagine a scenario like:
 -> write through the nvdimm driver
   -> acpi allocation
     -> writeback, causes more IO through the nvdimm driver
       -> deadlock

Fix this by using memalloc_noio_{save,restore}, which sets the GFP_NOIO
flag for the current scope when issuing commands/IOs that are expected
to clear errors.

Cc: <linux-acpi@vger.kernel.org>
Cc: <linux-nvdimm@lists.01.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers')

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 937fafa1886a..a18c2914f4b6 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,6 +11,7 @@
  * General Public License for more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/sched/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
@@ -234,6 +235,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 	struct nd_cmd_clear_error clear_err;
 	struct nd_cmd_ars_cap ars_cap;
 	u32 clear_err_unit, mask;
+	unsigned int noio_flag;
 	int cmd_rc, rc;
 
 	if (!nvdimm_bus)
@@ -250,8 +252,10 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 	memset(&ars_cap, 0, sizeof(ars_cap));
 	ars_cap.address = phys;
 	ars_cap.length = len;
+	noio_flag = memalloc_noio_save();
 	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap,
 			sizeof(ars_cap), &cmd_rc);
+	memalloc_noio_restore(noio_flag);
 	if (rc < 0)
 		return rc;
 	if (cmd_rc < 0)
@@ -266,8 +270,10 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 	memset(&clear_err, 0, sizeof(clear_err));
 	clear_err.address = phys;
 	clear_err.length = len;
+	noio_flag = memalloc_noio_save();
 	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err,
 			sizeof(clear_err), &cmd_rc);
+	memalloc_noio_restore(noio_flag);
 	if (rc < 0)
 		return rc;
 	if (cmd_rc < 0)
-- 
cgit v1.2.3-59-g8ed1b


From d9b83c7569536e3255992491737d9f895640ea18 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 30 Aug 2017 19:36:03 -0600
Subject: libnvdimm, btt: rework error clearing

Clearing errors or badblocks during a BTT write requires sending an ACPI
DSM, which means potentially sleeping. Since a BTT IO happens in atomic
context (preemption disabled, spinlocks may be held), we cannot perform
error clearing in the course of an IO. Due to this error clearing for
BTT IOs has hitherto been disabled.

In this patch we move error clearing out of the atomic section, and thus
re-enable error clearing with BTTs. When we are about to add a block to
the free list, we check if it was previously marked as an error, and if
it was, we add it to the freelist, but also set a flag that says error
clearing will be required. We then drop the lane (ending the atomic
context), and send a zero buffer so that the error can be cleared. The
error flag in the free list is protected by the nd 'lane', and is set
only be a thread while it holds that lane. When the error is cleared,
the flag is cleared, but while holding a mutex for that freelist index.

When writing, we check for two things -
1/ If the freelist mutex is held or if the error flag is set. If so,
this is an error block that is being (or about to be) cleared.
2/ If the block is a known badblock based on nsio->bb

The second check is required because the BTT map error flag for a map
entry only gets set when an error LBA is read. If we write to a new
location that may not have the map error flag set, but still might be in
the region's badblock list, we can trigger an EIO on the write, which is
undesirable and completely avoidable.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c   | 117 ++++++++++++++++++++++++++++++++++++++++++++-----
 drivers/nvdimm/btt.h   |   5 +++
 drivers/nvdimm/claim.c |   8 ----
 3 files changed, 111 insertions(+), 19 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 9c96530ea6d5..dabb84f7ab8a 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -31,6 +31,11 @@ enum log_ent_request {
 	LOG_OLD_ENT
 };
 
+static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
+{
+	return offset + nd_btt->initial_offset;
+}
+
 static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
 		void *buf, size_t n, unsigned long flags)
 {
@@ -38,7 +43,7 @@ static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
 	struct nd_namespace_common *ndns = nd_btt->ndns;
 
 	/* arena offsets may be shifted from the base of the device */
-	offset += arena->nd_btt->initial_offset;
+	offset = adjust_initial_offset(nd_btt, offset);
 	return nvdimm_read_bytes(ndns, offset, buf, n, flags);
 }
 
@@ -49,7 +54,7 @@ static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
 	struct nd_namespace_common *ndns = nd_btt->ndns;
 
 	/* arena offsets may be shifted from the base of the device */
-	offset += arena->nd_btt->initial_offset;
+	offset = adjust_initial_offset(nd_btt, offset);
 	return nvdimm_write_bytes(ndns, offset, buf, n, flags);
 }
 
@@ -381,7 +386,9 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
 	arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
 	if (++(arena->freelist[lane].seq) == 4)
 		arena->freelist[lane].seq = 1;
-	arena->freelist[lane].block = le32_to_cpu(ent->old_map);
+	if (ent_e_flag(ent->old_map))
+		arena->freelist[lane].has_err = 1;
+	arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
 
 	return ret;
 }
@@ -480,6 +487,40 @@ static int btt_log_init(struct arena_info *arena)
 	return ret;
 }
 
+static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
+{
+	return arena->dataoff + ((u64)lba * arena->internal_lbasize);
+}
+
+static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
+{
+	int ret = 0;
+
+	if (arena->freelist[lane].has_err) {
+		void *zero_page = page_address(ZERO_PAGE(0));
+		u32 lba = arena->freelist[lane].block;
+		u64 nsoff = to_namespace_offset(arena, lba);
+		unsigned long len = arena->sector_size;
+
+		mutex_lock(&arena->err_lock);
+
+		while (len) {
+			unsigned long chunk = min(len, PAGE_SIZE);
+
+			ret = arena_write_bytes(arena, nsoff, zero_page,
+				chunk, 0);
+			if (ret)
+				break;
+			len -= chunk;
+			nsoff += chunk;
+			if (len == 0)
+				arena->freelist[lane].has_err = 0;
+		}
+		mutex_unlock(&arena->err_lock);
+	}
+	return ret;
+}
+
 static int btt_freelist_init(struct arena_info *arena)
 {
 	int old, new, ret;
@@ -505,6 +546,16 @@ static int btt_freelist_init(struct arena_info *arena)
 		arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
 		arena->freelist[i].block = le32_to_cpu(log_new.old_map);
 
+		/*
+		 * FIXME: if error clearing fails during init, we want to make
+		 * the BTT read-only
+		 */
+		if (ent_e_flag(log_new.old_map)) {
+			ret = arena_clear_freelist_error(arena, i);
+			if (ret)
+				WARN_ONCE(1, "Unable to clear known errors\n");
+		}
+
 		/* This implies a newly created or untouched flog entry */
 		if (log_new.old_map == log_new.new_map)
 			continue;
@@ -525,7 +576,6 @@ static int btt_freelist_init(struct arena_info *arena)
 			if (ret)
 				return ret;
 		}
-
 	}
 
 	return 0;
@@ -695,6 +745,7 @@ static int discover_arenas(struct btt *btt)
 		arena->external_lba_start = cur_nlba;
 		parse_arena_meta(arena, super, cur_off);
 
+		mutex_init(&arena->err_lock);
 		ret = btt_freelist_init(arena);
 		if (ret)
 			goto out;
@@ -905,11 +956,6 @@ static void unlock_map(struct arena_info *arena, u32 premap)
 	spin_unlock(&arena->map_locks[idx].lock);
 }
 
-static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
-{
-	return arena->dataoff + ((u64)lba * arena->internal_lbasize);
-}
-
 static int btt_data_read(struct arena_info *arena, struct page *page,
 			unsigned int off, u32 lba, u32 len)
 {
@@ -1067,8 +1113,14 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
 		}
 
 		ret = btt_data_read(arena, page, off, postmap, cur_len);
-		if (ret)
+		if (ret) {
+			int rc;
+
+			/* Media error - set the e_flag */
+			rc = btt_map_write(arena, premap, postmap, 0, 1,
+				NVDIMM_IO_ATOMIC);
 			goto out_rtt;
+		}
 
 		if (bip) {
 			ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
@@ -1093,6 +1145,21 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
 	return ret;
 }
 
+/*
+ * Normally, arena_{read,write}_bytes will take care of the initial offset
+ * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem,
+ * we need the final, raw namespace offset here
+ */
+static bool btt_is_badblock(struct btt *btt, struct arena_info *arena,
+		u32 postmap)
+{
+	u64 nsoff = adjust_initial_offset(arena->nd_btt,
+			to_namespace_offset(arena, postmap));
+	sector_t phys_sector = nsoff >> 9;
+
+	return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize);
+}
+
 static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 			sector_t sector, struct page *page, unsigned int off,
 			unsigned int len)
@@ -1105,7 +1172,9 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 
 	while (len) {
 		u32 cur_len;
+		int e_flag;
 
+ retry:
 		lane = nd_region_acquire_lane(btt->nd_region);
 
 		ret = lba_to_arena(btt, sector, &premap, &arena);
@@ -1118,6 +1187,21 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 			goto out_lane;
 		}
 
+		if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
+			arena->freelist[lane].has_err = 1;
+
+		if (mutex_is_locked(&arena->err_lock)
+				|| arena->freelist[lane].has_err) {
+			nd_region_release_lane(btt->nd_region, lane);
+
+			ret = arena_clear_freelist_error(arena, lane);
+			if (ret)
+				return ret;
+
+			/* OK to acquire a different lane/free block */
+			goto retry;
+		}
+
 		new_postmap = arena->freelist[lane].block;
 
 		/* Wait if the new block is being read from */
@@ -1143,7 +1227,7 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 		}
 
 		lock_map(arena, premap);
-		ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL,
+		ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag,
 				NVDIMM_IO_ATOMIC);
 		if (ret)
 			goto out_map;
@@ -1151,6 +1235,8 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 			ret = -EIO;
 			goto out_map;
 		}
+		if (e_flag)
+			set_e_flag(old_postmap);
 
 		log.lba = cpu_to_le32(premap);
 		log.old_map = cpu_to_le32(old_postmap);
@@ -1169,6 +1255,12 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 		unlock_map(arena, premap);
 		nd_region_release_lane(btt->nd_region, lane);
 
+		if (e_flag) {
+			ret = arena_clear_freelist_error(arena, lane);
+			if (ret)
+				return ret;
+		}
+
 		len -= cur_len;
 		off += cur_len;
 		sector += btt->sector_size >> SECTOR_SHIFT;
@@ -1349,6 +1441,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 {
 	int ret;
 	struct btt *btt;
+	struct nd_namespace_io *nsio;
 	struct device *dev = &nd_btt->dev;
 
 	btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
@@ -1362,6 +1455,8 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 	INIT_LIST_HEAD(&btt->arena_list);
 	mutex_init(&btt->init_lock);
 	btt->nd_region = nd_region;
+	nsio = to_nd_namespace_io(&nd_btt->ndns->dev);
+	btt->phys_bb = &nsio->bb;
 
 	ret = discover_arenas(btt);
 	if (ret) {
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 2bc0d10b8438..578c2057524d 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -15,6 +15,7 @@
 #ifndef _LINUX_BTT_H
 #define _LINUX_BTT_H
 
+#include <linux/badblocks.h>
 #include <linux/types.h>
 
 #define BTT_SIG_LEN 16
@@ -41,6 +42,7 @@
 #define ent_lba(ent) (ent & MAP_LBA_MASK)
 #define ent_e_flag(ent) (!!(ent & MAP_ERR_MASK))
 #define ent_z_flag(ent) (!!(ent & MAP_TRIM_MASK))
+#define set_e_flag(ent) (ent |= MAP_ERR_MASK)
 
 enum btt_init_state {
 	INIT_UNCHECKED = 0,
@@ -82,6 +84,7 @@ struct free_entry {
 	u32 block;
 	u8 sub;
 	u8 seq;
+	u8 has_err;
 };
 
 struct aligned_lock {
@@ -153,6 +156,7 @@ struct arena_info {
 	struct dentry *debugfs_dir;
 	/* Arena flags */
 	u32 flags;
+	struct mutex err_lock;
 };
 
 /**
@@ -187,6 +191,7 @@ struct btt {
 	struct mutex init_lock;
 	int init_state;
 	int num_arenas;
+	struct badblocks *phys_bb;
 };
 
 bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super);
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 3e6404f1ba5a..b2fc29b8279b 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -280,14 +280,6 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
-		/*
-		 * FIXME: nsio_rw_bytes() may be called from atomic
-		 * context in the btt case and the ACPI DSM path for
-		 * clearing the error takes sleeping locks and allocates
-		 * memory. An explicit error clearing path, and support
-		 * for tracking badblocks in BTT metadata is needed to
-		 * work around this collision.
-		 */
 		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
 				&& !(flags & NVDIMM_IO_ATOMIC)) {
 			long cleared;
-- 
cgit v1.2.3-59-g8ed1b


From 5deb67f77a266010e2c10fb124b7516d0d258ce8 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 31 Aug 2017 12:27:09 +0100
Subject: libnvdimm, nd_blk: remove mmio_flush_range()

mmio_flush_range() suffers from a lack of clearly-defined semantics,
and is somewhat ambiguous to port to other architectures where the
scope of the writeback implied by "flush" and ordering might matter,
but MMIO would tend to imply non-cacheable anyway. Per the rationale
in 67a3e8fe9015 ("nd_blk: change aperture mapping from WC to WB"), the
only existing use is actually to invalidate clean cache lines for
ARCH_MEMREMAP_PMEM type mappings *without* writeback. Since the recent
cleanup of the pmem API, that also now happens to be the exact purpose
of arch_invalidate_pmem(), which would be a far more well-defined tool
for the job.

Rather than risk potentially inconsistent implementations of
mmio_flush_range() for the sake of one callsite, streamline things by
removing it entirely and instead move the ARCH_MEMREMAP_PMEM related
definitions up to the libnvdimm level, so they can be shared by NFIT
as well. This allows NFIT to be enabled for arm64.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/Kconfig                  |  1 -
 arch/x86/include/asm/cacheflush.h |  2 --
 drivers/acpi/nfit/Kconfig         |  2 +-
 drivers/acpi/nfit/core.c          |  2 +-
 drivers/nvdimm/pmem.h             | 14 --------------
 include/linux/libnvdimm.h         | 15 +++++++++++++++
 lib/Kconfig                       |  3 ---
 tools/testing/nvdimm/test/nfit.c  |  4 ++--
 8 files changed, 19 insertions(+), 24 deletions(-)

(limited to 'drivers')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..5f3b756ec0d3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,7 +53,6 @@ config X86
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV			if X86_64
-	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 8b4140f6724f..cb9a1af109b4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -7,6 +7,4 @@
 
 void clflush_cache_range(void *addr, unsigned int size);
 
-#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
-
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig
index 6d3351452ea2..929ba4da0b30 100644
--- a/drivers/acpi/nfit/Kconfig
+++ b/drivers/acpi/nfit/Kconfig
@@ -2,7 +2,7 @@ config ACPI_NFIT
 	tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
 	depends on PHYS_ADDR_T_64BIT
 	depends on BLK_DEV
-	depends on ARCH_HAS_MMIO_FLUSH
+	depends on ARCH_HAS_PMEM_API
 	select LIBNVDIMM
 	help
 	  Infrastructure to probe ACPI 6 compliant platforms for
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 03105648f9b1..c20124a6eb49 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1964,7 +1964,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 			memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
 		else {
 			if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
-				mmio_flush_range((void __force *)
+				arch_invalidate_pmem((void __force *)
 					mmio->addr.aperture + offset, c);
 
 			memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 5434321cad67..c5917f040fa7 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -5,20 +5,6 @@
 #include <linux/pfn_t.h>
 #include <linux/fs.h>
 
-#ifdef CONFIG_ARCH_HAS_PMEM_API
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
-void arch_wb_cache_pmem(void *addr, size_t size);
-void arch_invalidate_pmem(void *addr, size_t size);
-#else
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
-static inline void arch_wb_cache_pmem(void *addr, size_t size)
-{
-}
-static inline void arch_invalidate_pmem(void *addr, size_t size)
-{
-}
-#endif
-
 /* this definition is in it's own header for tools/testing/nvdimm to consume */
 struct pmem_device {
 	/* One contiguous memory region per device */
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 9b8d81a7b80e..3eaad2fbf284 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -174,4 +174,19 @@ u64 nd_fletcher64(void *addr, size_t len, bool le);
 void nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
 int nvdimm_has_cache(struct nd_region *nd_region);
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
+void arch_wb_cache_pmem(void *addr, size_t size);
+void arch_invalidate_pmem(void *addr, size_t size);
+#else
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
+static inline void arch_wb_cache_pmem(void *addr, size_t size)
+{
+}
+static inline void arch_invalidate_pmem(void *addr, size_t size)
+{
+}
+#endif
+
 #endif /* __LIBNVDIMM_H__ */
diff --git a/lib/Kconfig b/lib/Kconfig
index 6762529ad9e4..527da69e3be1 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -559,9 +559,6 @@ config ARCH_HAS_PMEM_API
 config ARCH_HAS_UACCESS_FLUSHCACHE
 	bool
 
-config ARCH_HAS_MMIO_FLUSH
-	bool
-
 config STACKDEPOT
 	bool
 	select STACKTRACE
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 4c2fa98ef39d..d20791c3f499 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1546,8 +1546,8 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
 	else {
 		memcpy(iobuf, mmio->addr.base + dpa, len);
 
-		/* give us some some coverage of the mmio_flush_range() API */
-		mmio_flush_range(mmio->addr.base + dpa, len);
+		/* give us some some coverage of the arch_invalidate_pmem() API */
+		arch_invalidate_pmem(mmio->addr.base + dpa, len);
 	}
 	nd_region_release_lane(nd_region, lane);
 
-- 
cgit v1.2.3-59-g8ed1b


From 58738c495e15badd2015e19ff41f1f1ed55200bc Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 31 Aug 2017 15:41:55 -0700
Subject: libnvdimm: fix integer overflow static analysis warning

Dan reports:
    The patch 62232e45f4a2: "libnvdimm: control (ioctl) messages for
    nvdimm_bus and nvdimm devices" from Jun 8, 2015, leads to the
    following static checker warning:

            drivers/nvdimm/bus.c:1018 __nd_ioctl()
            warn: integer overflows 'buf_len'

    From a casual review, this seems like it might be a real bug.  On
    the first iteration we load some data into in_env[].  On the second
    iteration we read a use controlled "in_size" from nd_cmd_in_size().
    It can go up to UINT_MAX - 1.  A high number means we will fill the
    whole in_env[] buffer.  But we potentially keep looping and adding
    more to in_len so now it can be any value.

    It simple enough to change, but it feels weird that we keep looping
    even though in_env is totally full.  Shouldn't we just return an
    error if we don't have space for desc->in_num.

We keep looping because the size of the total input is allowed to be
bigger than the 'envelope' which is a subset of the payload that tells
us how much data to expect. For safety explicitly check that buf_len
does not overflow which is what the checker flagged.

Cc: <stable@vger.kernel.org>
Fixes: 62232e45f4a2: "libnvdimm: control (ioctl) messages for nvdimm_bus..."
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index a18c2914f4b6..66586ce23f1b 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -911,19 +911,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		int read_only, unsigned int ioctl_cmd, unsigned long arg)
 {
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
-	size_t buf_len = 0, in_len = 0, out_len = 0;
 	static char out_env[ND_CMD_MAX_ENVELOPE];
 	static char in_env[ND_CMD_MAX_ENVELOPE];
 	const struct nd_cmd_desc *desc = NULL;
 	unsigned int cmd = _IOC_NR(ioctl_cmd);
-	unsigned int func = cmd;
-	void __user *p = (void __user *) arg;
 	struct device *dev = &nvdimm_bus->dev;
-	struct nd_cmd_pkg pkg;
+	void __user *p = (void __user *) arg;
 	const char *cmd_name, *dimm_name;
+	u32 in_len = 0, out_len = 0;
+	unsigned int func = cmd;
 	unsigned long cmd_mask;
-	void *buf;
+	struct nd_cmd_pkg pkg;
 	int rc, i, cmd_rc;
+	u64 buf_len = 0;
+	void *buf;
 
 	if (nvdimm) {
 		desc = nd_cmd_dimm_desc(cmd);
@@ -983,7 +984,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 
 	if (cmd == ND_CMD_CALL) {
 		func = pkg.nd_command;
-		dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n",
+		dev_dbg(dev, "%s:%s, idx: %llu, in: %u, out: %u, len %llu\n",
 				__func__, dimm_name, pkg.nd_command,
 				in_len, out_len, buf_len);
 
@@ -1013,9 +1014,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		out_len += out_size;
 	}
 
-	buf_len = out_len + in_len;
+	buf_len = (u64) out_len + (u64) in_len;
 	if (buf_len > ND_IOCTL_MAX_BUFLEN) {
-		dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
+		dev_dbg(dev, "%s:%s cmd: %s buf_len: %llu > %d\n", __func__,
 				dimm_name, cmd_name, buf_len,
 				ND_IOCTL_MAX_BUFLEN);
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 26f2f4de0bd93cbe891e11307d9e4906253bfda8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 3 Sep 2017 10:17:53 -0700
Subject: dax: fix FS_DAX=n BLOCK=y compilation

The 0day kbuild robot reports:

>> drivers//dax/super.c:64:20: error: redefinition of 'fs_dax_get_by_bdev'
    struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
                       ^~~~~~~~~~~~~~~~~~
   In file included from drivers//dax/super.c:22:0:
   include/linux/dax.h:76:34: note: previous definition of 'fs_dax_get_by_bdev' was here
    static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
                                     ^~~~~~~~~~~~~~~~~~

Protect the definition of fs_dax_get_by_bdev() in drivers/dax/super.c
with an ifdef.

Fixes: 78f354735081 ("dax: introduce a fs_dax_get_by_bdev() helper")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index b699aac268a6..3600ff786646 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -61,6 +61,7 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
 }
 EXPORT_SYMBOL(bdev_dax_pgoff);
 
+#if IS_ENABLED(CONFIG_FS_DAX)
 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 {
 	if (!blk_queue_dax(bdev->bd_queue))
@@ -68,6 +69,7 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 	return fs_dax_get_by_host(bdev->bd_disk->disk_name);
 }
 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+#endif
 
 /**
  * __bdev_dax_supported() - Check if the device supports dax for filesystem
-- 
cgit v1.2.3-59-g8ed1b


From 9edcad53d673fb033c2da7c6c05d30737739fdf5 Mon Sep 17 00:00:00 2001
From: Meng Xu <mengxu.gatech@gmail.com>
Date: Mon, 4 Sep 2017 11:34:33 -0400
Subject: libnvdimm, nfit: move the check on nd_reserved2 to the endpoint

Delay the check of nd_reserved2 to the actual endpoint (acpi_nfit_ctl)
that uses it, as a prevention of a potential double-fetch bug.

While examining the kernel source code, I found a dangerous operation that
could turn into a double-fetch situation (a race condition bug) where
the same userspace memory region are fetched twice into kernel with sanity
checks after the first fetch while missing checks after the second fetch.

In the case of _IOC_NR(ioctl_cmd) == ND_CMD_CALL:

1. The first fetch happens in line 935 copy_from_user(&pkg, p, sizeof(pkg)

2. subsequently `pkg.nd_reserved2` is asserted to be all zeroes
(line 984 to 986).

3. The second fetch happens in line 1022 copy_from_user(buf, p, buf_len)

4. Given that `p` can be fully controlled in userspace, an attacker can
race condition to override the header part of `p`, say,
`((struct nd_cmd_pkg *)p)->nd_reserved2` to arbitrary value
(say nine 0xFFFFFFFF for `nd_reserved2`) after the first fetch but before the
second fetch. The changed value will be copied to `buf`.

5. There is no checks on the second fetches until the use of it in
line 1034: nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf) and
line 1038: nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc)
which means that the assumed relation, `p->nd_reserved2` are all zeroes might
not hold after the second fetch. And once the control goes to these functions
we lose the context to assert the assumed relation.

6. Based on my manual analysis, `p->nd_reserved2` is not used in function
`nd_cmd_clear_to_send` and potential implementations of `nd_desc->ndctl`
so there is no working exploit against it right now. However, this could
easily turns to an exploitable one if careless developers start to use
`p->nd_reserved2` later and assume that they are all zeroes.

Move the validation of the nd_reserved2 field to the ->ndctl()
implementation where it has a stable buffer to evaluate.

Signed-off-by: Meng Xu <mengxu.gatech@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 4 ++++
 drivers/nvdimm/bus.c     | 4 ----
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index c20124a6eb49..42221e785c47 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -228,6 +228,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 	if (cmd == ND_CMD_CALL) {
 		call_pkg = buf;
 		func = call_pkg->nd_command;
+
+		for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++)
+			if (call_pkg->nd_reserved2[i])
+				return -EINVAL;
 	}
 
 	if (nvdimm) {
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 66586ce23f1b..baf283986a7e 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -987,10 +987,6 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		dev_dbg(dev, "%s:%s, idx: %llu, in: %u, out: %u, len %llu\n",
 				__func__, dimm_name, pkg.nd_command,
 				in_len, out_len, buf_len);
-
-		for (i = 0; i < ARRAY_SIZE(pkg.nd_reserved2); i++)
-			if (pkg.nd_reserved2[i])
-				return -EINVAL;
 	}
 
 	/* process an output envelope */
-- 
cgit v1.2.3-59-g8ed1b


From 86652d2eb347080a991968c9d68708dc010ac56c Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Tue, 5 Sep 2017 14:35:39 -0600
Subject: libnvdimm, btt: clean up warning and error messages

Convert all WARN* style messages to dev_WARN, and for errors in the IO
paths, use dev_err_ratelimited. Also remove some BUG_ONs in the IO path
and replace them with the above - no need to crash the machine in case
of an unaligned IO.

Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c | 58 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 22 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index dabb84f7ab8a..130193a9cd8c 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -31,6 +31,11 @@ enum log_ent_request {
 	LOG_OLD_ENT
 };
 
+static struct device *to_dev(struct arena_info *arena)
+{
+	return &arena->nd_btt->dev;
+}
+
 static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
 {
 	return offset + nd_btt->initial_offset;
@@ -67,8 +72,10 @@ static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
 	 * We rely on that to make sure rw_bytes does error clearing
 	 * correctly, so make sure that is the case.
 	 */
-	WARN_ON_ONCE(!IS_ALIGNED(arena->infooff, 512));
-	WARN_ON_ONCE(!IS_ALIGNED(arena->info2off, 512));
+	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512),
+		"arena->infooff: %#llx is unaligned\n", arena->infooff);
+	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512),
+		"arena->info2off: %#llx is unaligned\n", arena->info2off);
 
 	ret = arena_write_bytes(arena, arena->info2off, super,
 			sizeof(struct btt_sb), 0);
@@ -81,7 +88,6 @@ static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
 
 static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
 {
-	WARN_ON(!super);
 	return arena_read_bytes(arena, arena->infooff, super,
 			sizeof(struct btt_sb), 0);
 }
@@ -97,7 +103,10 @@ static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping,
 {
 	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
 
-	WARN_ON(lba >= arena->external_nlba);
+	if (unlikely(lba >= arena->external_nlba))
+		dev_err_ratelimited(to_dev(arena),
+			"%s: lba %#x out of range (max: %#x)\n",
+			__func__, lba, arena->external_nlba);
 	return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags);
 }
 
@@ -136,7 +145,8 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
 		 * construed as a valid 'normal' case, but we decide not to,
 		 * to avoid confusion
 		 */
-		WARN_ONCE(1, "Invalid use of Z and E flags\n");
+		dev_err_ratelimited(to_dev(arena),
+			"Invalid use of Z and E flags\n");
 		return -EIO;
 	}
 
@@ -152,7 +162,10 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 	u32 raw_mapping, postmap, ze, z_flag, e_flag;
 	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
 
-	WARN_ON(lba >= arena->external_nlba);
+	if (unlikely(lba >= arena->external_nlba))
+		dev_err_ratelimited(to_dev(arena),
+			"%s: lba %#x out of range (max: %#x)\n",
+			__func__, lba, arena->external_nlba);
 
 	ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
 	if (ret)
@@ -200,7 +213,6 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 static int btt_log_read_pair(struct arena_info *arena, u32 lane,
 			struct log_entry *ent)
 {
-	WARN_ON(!ent);
 	return arena_read_bytes(arena,
 			arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
 			2 * LOG_ENT_SIZE, 0);
@@ -304,11 +316,6 @@ static int btt_log_get_old(struct log_entry *ent)
 	return old;
 }
 
-static struct device *to_dev(struct arena_info *arena)
-{
-	return &arena->nd_btt->dev;
-}
-
 /*
  * This function copies the desired (old/new) log entry into ent if
  * it is not NULL. It returns the sub-slot number (0 or 1)
@@ -414,12 +421,14 @@ static int btt_map_init(struct arena_info *arena)
 	 * make sure rw_bytes does error clearing correctly, so make sure that
 	 * is the case.
 	 */
-	WARN_ON_ONCE(!IS_ALIGNED(arena->mapoff, 512));
+	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
+		"arena->mapoff: %#llx is unaligned\n", arena->mapoff);
 
 	while (mapsize) {
 		size_t size = min(mapsize, chunk_size);
 
-		WARN_ON_ONCE(size < 512);
+		dev_WARN_ONCE(to_dev(arena), size < 512,
+			"chunk size: %#lx is unaligned\n", size);
 		ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
 				size, 0);
 		if (ret)
@@ -456,12 +465,14 @@ static int btt_log_init(struct arena_info *arena)
 	 * make sure rw_bytes does error clearing correctly, so make sure that
 	 * is the case.
 	 */
-	WARN_ON_ONCE(!IS_ALIGNED(arena->logoff, 512));
+	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512),
+		"arena->logoff: %#llx is unaligned\n", arena->logoff);
 
 	while (logsize) {
 		size_t size = min(logsize, chunk_size);
 
-		WARN_ON_ONCE(size < 512);
+		dev_WARN_ONCE(to_dev(arena), size < 512,
+			"chunk size: %#lx is unaligned\n", size);
 		ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
 				size, 0);
 		if (ret)
@@ -553,7 +564,8 @@ static int btt_freelist_init(struct arena_info *arena)
 		if (ent_e_flag(log_new.old_map)) {
 			ret = arena_clear_freelist_error(arena, i);
 			if (ret)
-				WARN_ONCE(1, "Unable to clear known errors\n");
+				dev_err_ratelimited(to_dev(arena),
+					"Unable to clear known errors\n");
 		}
 
 		/* This implies a newly created or untouched flog entry */
@@ -1309,11 +1321,13 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 	bio_for_each_segment(bvec, bio, iter) {
 		unsigned int len = bvec.bv_len;
 
-		BUG_ON(len > PAGE_SIZE);
-		/* Make sure len is in multiples of sector size. */
-		/* XXX is this right? */
-		BUG_ON(len < btt->sector_size);
-		BUG_ON(len % btt->sector_size);
+		if (len > PAGE_SIZE || len < btt->sector_size ||
+				len % btt->sector_size) {
+			dev_err_ratelimited(&btt->nd_btt->dev,
+				"unaligned bio segment (len: %d)\n", len);
+			bio->bi_status = BLK_STS_IOERR;
+			break;
+		}
 
 		err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
 				  op_is_write(bio_op(bio)), iter.bi_sector);
-- 
cgit v1.2.3-59-g8ed1b


From 04c3c982fcc0151ed3574d7ae4f1e62278054d72 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 8 Sep 2017 09:36:57 -0700
Subject: libnvdimm, btt: fix format string warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix format warnings (seen on i386) in nvdimm/btt.c:

../drivers/nvdimm/btt.c: In function ‘btt_map_init’:
../drivers/nvdimm/btt.c:430:3: warning: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 4 has type ‘size_t’ [-Wformat=]
   dev_WARN_ONCE(to_dev(arena), size < 512,
   ^
../drivers/nvdimm/btt.c: In function ‘btt_log_init’:
../drivers/nvdimm/btt.c:474:3: warning: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 4 has type ‘size_t’ [-Wformat=]
   dev_WARN_ONCE(to_dev(arena), size < 512,
   ^

Fixes: 86652d2eb347 ("libnvdimm, btt: clean up warning and error messages")
Reported-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 130193a9cd8c..b9008c3f0d17 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -428,7 +428,7 @@ static int btt_map_init(struct arena_info *arena)
 		size_t size = min(mapsize, chunk_size);
 
 		dev_WARN_ONCE(to_dev(arena), size < 512,
-			"chunk size: %#lx is unaligned\n", size);
+			"chunk size: %#zx is unaligned\n", size);
 		ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
 				size, 0);
 		if (ret)
@@ -472,7 +472,7 @@ static int btt_log_init(struct arena_info *arena)
 		size_t size = min(logsize, chunk_size);
 
 		dev_WARN_ONCE(to_dev(arena), size < 512,
-			"chunk size: %#lx is unaligned\n", size);
+			"chunk size: %#zx is unaligned\n", size);
 		ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
 				size, 0);
 		if (ret)
-- 
cgit v1.2.3-59-g8ed1b