From 3d88002e4a7bd40f355550284c6cd140e6fe29dc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 15:02:11 -0400 Subject: libnvdimm: support for legacy (non-aliasing) nvdimms The libnvdimm region driver is an intermediary driver that translates non-volatile "region"s into "namespace" sub-devices that are surfaced by persistent memory block-device drivers (PMEM and BLK). ACPI 6 introduces the concept that a given nvdimm may simultaneously offer multiple access modes to its media through direct PMEM load/store access, or windowed BLK mode. Existing nvdimms mostly implement a PMEM interface, some offer a BLK-like mode, but never both as ACPI 6 defines. If an nvdimm is single interfaced, then there is no need for dimm metadata labels. For these devices we can take the region boundaries directly to create a child namespace device (nd_namespace_io). Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 111 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 drivers/nvdimm/namespace_devs.c (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c new file mode 100644 index 000000000000..4f653d1e61ad --- /dev/null +++ b/drivers/nvdimm/namespace_devs.c @@ -0,0 +1,111 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include "nd.h" + +static void namespace_io_release(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + kfree(nsio); +} + +static struct device_type namespace_io_device_type = { + .name = "nd_namespace_io", + .release = namespace_io_release, +}; + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static struct attribute *nd_namespace_attributes[] = { + &dev_attr_nstype.attr, + NULL, +}; + +static struct attribute_group nd_namespace_attribute_group = { + .attrs = nd_namespace_attributes, +}; + +static const struct attribute_group *nd_namespace_attribute_groups[] = { + &nd_device_attribute_group, + &nd_namespace_attribute_group, + NULL, +}; + +static struct device **create_namespace_io(struct nd_region *nd_region) +{ + struct nd_namespace_io *nsio; + struct device *dev, **devs; + struct resource *res; + + nsio = kzalloc(sizeof(*nsio), GFP_KERNEL); + if (!nsio) + return NULL; + + devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); + if (!devs) { + kfree(nsio); + return NULL; + } + + dev = &nsio->dev; + dev->type = &namespace_io_device_type; + dev->parent = &nd_region->dev; + res = &nsio->res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + res->start = nd_region->ndr_start; + res->end = res->start + nd_region->ndr_size - 1; + + devs[0] = dev; + return devs; +} + +int nd_region_register_namespaces(struct nd_region *nd_region, int *err) +{ + struct device **devs = NULL; + int i; + + *err = 0; + switch (nd_region_to_nstype(nd_region)) { + case ND_DEVICE_NAMESPACE_IO: + devs = create_namespace_io(nd_region); + break; + default: + break; + } + + if (!devs) + return -ENODEV; + + for (i = 0; devs[i]; i++) { + struct device *dev = devs[i]; + + dev_set_name(dev, "namespace%d.%d", nd_region->id, i); + dev->groups = nd_namespace_attribute_groups; + nd_device_register(dev); + } + kfree(devs); + + return i; +} -- cgit v1.2.3-59-g8ed1b From bf9bccc14c05dae8caba29df6187c731710f5380 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 17 Jun 2015 17:14:46 -0400 Subject: libnvdimm: pmem label sets and namespace instantiation. A complete label set is a PMEM-label per-dimm per-interleave-set where all the UUIDs match and the interleave set cookie matches the hosting interleave set. Present sysfs attributes for manipulation of a PMEM-namespace's 'alt_name', 'uuid', and 'size' attributes. A later patch will make these settings persistent by writing back the label. Note that PMEM allocations grow forwards from the start of an interleave set (lowest dimm-physical-address (DPA)). BLK-namespaces that alias with a PMEM interleave set will grow allocations backward from the highest DPA. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 8 +- drivers/nvdimm/core.c | 64 +++ drivers/nvdimm/dimm.c | 21 +- drivers/nvdimm/dimm_devs.c | 137 ++++++ drivers/nvdimm/label.c | 55 ++- drivers/nvdimm/label.h | 2 + drivers/nvdimm/namespace_devs.c | 1002 ++++++++++++++++++++++++++++++++++++++- drivers/nvdimm/nd-core.h | 12 + drivers/nvdimm/nd.h | 17 + drivers/nvdimm/pmem.c | 20 +- drivers/nvdimm/region.c | 3 + drivers/nvdimm/region_devs.c | 158 +++++- include/linux/libnvdimm.h | 10 + include/linux/nd.h | 24 + include/uapi/linux/ndctl.h | 4 + 15 files changed, 1506 insertions(+), 31 deletions(-) (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index ffb43cada625..fddc3f2a8f80 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -97,6 +97,8 @@ static int nvdimm_bus_probe(struct device *dev) rc = nd_drv->probe(dev); if (rc == 0) nd_region_probe_success(nvdimm_bus, dev); + else + nd_region_disable(nvdimm_bus, dev); nvdimm_bus_probe_end(nvdimm_bus); dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, @@ -381,8 +383,10 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, } EXPORT_SYMBOL_GPL(nd_cmd_out_size); -static void wait_nvdimm_bus_probe_idle(struct nvdimm_bus *nvdimm_bus) +void wait_nvdimm_bus_probe_idle(struct device *dev) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + do { if (nvdimm_bus->probe_active == 0) break; @@ -402,7 +406,7 @@ static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd) return 0; nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev); - wait_nvdimm_bus_probe_idle(nvdimm_bus); + wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev); if (atomic_read(&nvdimm->busy)) return -EBUSY; diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 7806eaaf4707..cf99cce8ef33 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -109,6 +110,69 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev) return NULL; } +static bool is_uuid_sep(char sep) +{ + if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0') + return true; + return false; +} + +static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf, + size_t len) +{ + const char *str = buf; + u8 uuid[16]; + int i; + + for (i = 0; i < 16; i++) { + if (!isxdigit(str[0]) || !isxdigit(str[1])) { + dev_dbg(dev, "%s: pos: %d buf[%zd]: %c buf[%zd]: %c\n", + __func__, i, str - buf, str[0], + str + 1 - buf, str[1]); + return -EINVAL; + } + + uuid[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); + str += 2; + if (is_uuid_sep(*str)) + str++; + } + + memcpy(uuid_out, uuid, sizeof(uuid)); + return 0; +} + +/** + * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes + * @dev: container device for the uuid property + * @uuid_out: uuid buffer to replace + * @buf: raw sysfs buffer to parse + * + * Enforce that uuids can only be changed while the device is disabled + * (driver detached) + * LOCKING: expects device_lock() is held on entry + */ +int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, + size_t len) +{ + u8 uuid[16]; + int rc; + + if (dev->driver) + return -EBUSY; + + rc = nd_uuid_parse(dev, uuid, buf, len); + if (rc) + return rc; + + kfree(*uuid_out); + *uuid_out = kmemdup(uuid, sizeof(uuid), GFP_KERNEL); + if (!(*uuid_out)) + return -ENOMEM; + + return 0; +} + static ssize_t commands_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index 2df97c3c3b34..71d12bb67339 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -21,18 +21,6 @@ #include "label.h" #include "nd.h" -static void free_data(struct nvdimm_drvdata *ndd) -{ - if (!ndd) - return; - - if (ndd->data && is_vmalloc_addr(ndd->data)) - vfree(ndd->data); - else - kfree(ndd->data); - kfree(ndd); -} - static int nvdimm_probe(struct device *dev) { struct nvdimm_drvdata *ndd; @@ -49,6 +37,8 @@ static int nvdimm_probe(struct device *dev) ndd->dpa.start = 0; ndd->dpa.end = -1; ndd->dev = dev; + get_device(dev); + kref_init(&ndd->kref); rc = nvdimm_init_nsarea(ndd); if (rc) @@ -74,21 +64,18 @@ static int nvdimm_probe(struct device *dev) return 0; err: - free_data(ndd); + put_ndd(ndd); return rc; } static int nvdimm_remove(struct device *dev) { struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); - struct resource *res, *_r; nvdimm_bus_lock(dev); dev_set_drvdata(dev, NULL); - for_each_dpa_resource_safe(ndd, res, _r) - nvdimm_free_dpa(ndd, res); nvdimm_bus_unlock(dev); - free_data(ndd); + put_ndd(ndd); return 0; } diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index d2ef02e4be6c..b55acef179ba 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -159,6 +159,48 @@ struct nvdimm *to_nvdimm(struct device *dev) } EXPORT_SYMBOL_GPL(to_nvdimm); +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev)); + + return dev_get_drvdata(&nvdimm->dev); +} +EXPORT_SYMBOL(to_ndd); + +void nvdimm_drvdata_release(struct kref *kref) +{ + struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref); + struct device *dev = ndd->dev; + struct resource *res, *_r; + + dev_dbg(dev, "%s\n", __func__); + + nvdimm_bus_lock(dev); + for_each_dpa_resource_safe(ndd, res, _r) + nvdimm_free_dpa(ndd, res); + nvdimm_bus_unlock(dev); + + if (ndd->data && is_vmalloc_addr(ndd->data)) + vfree(ndd->data); + else + kfree(ndd->data); + kfree(ndd); + put_device(dev); +} + +void get_ndd(struct nvdimm_drvdata *ndd) +{ + kref_get(&ndd->kref); +} + +void put_ndd(struct nvdimm_drvdata *ndd) +{ + if (ndd) + kref_put(&ndd->kref, nvdimm_drvdata_release); +} + const char *nvdimm_name(struct nvdimm *nvdimm) { return dev_name(&nvdimm->dev); @@ -247,6 +289,83 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } EXPORT_SYMBOL_GPL(nvdimm_create); +/** + * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa + * @nd_mapping: container of dpa-resource-root + labels + * @nd_region: constrain available space check to this reference region + * @overlap: calculate available space assuming this level of overlap + * + * Validate that a PMEM label, if present, aligns with the start of an + * interleave set and truncate the available size at the lowest BLK + * overlap point. + * + * The expectation is that this routine is called multiple times as it + * probes for the largest BLK encroachment for any single member DIMM of + * the interleave set. Once that value is determined the PMEM-limit for + * the set can be established. + */ +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, resource_size_t *overlap) +{ + resource_size_t map_start, map_end, busy = 0, available, blk_start; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + const char *reason; + + if (!ndd) + return 0; + + map_start = nd_mapping->start; + map_end = map_start + nd_mapping->size - 1; + blk_start = max(map_start, map_end + 1 - *overlap); + for_each_dpa_resource(ndd, res) + if (res->start >= map_start && res->start < map_end) { + if (strncmp(res->name, "blk", 3) == 0) + blk_start = min(blk_start, res->start); + else if (res->start != map_start) { + reason = "misaligned to iset"; + goto err; + } else { + if (busy) { + reason = "duplicate overlapping PMEM reservations?"; + goto err; + } + busy += resource_size(res); + continue; + } + } else if (res->end >= map_start && res->end <= map_end) { + if (strncmp(res->name, "blk", 3) == 0) { + /* + * If a BLK allocation overlaps the start of + * PMEM the entire interleave set may now only + * be used for BLK. + */ + blk_start = map_start; + } else { + reason = "misaligned to iset"; + goto err; + } + } else if (map_start > res->start && map_start < res->end) { + /* total eclipse of the mapping */ + busy += nd_mapping->size; + blk_start = map_start; + } + + *overlap = map_end + 1 - blk_start; + available = blk_start - map_start; + if (busy < available) + return available - busy; + return 0; + + err: + /* + * Something is wrong, PMEM must align with the start of the + * interleave set, and there can only be one allocation per set. + */ + nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason); + return 0; +} + void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res) { WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); @@ -271,6 +390,24 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, return res; } +/** + * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id + * @nvdimm: container of dpa-resource-root + labels + * @label_id: dpa resource name of the form {pmem|blk}- + */ +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id) +{ + resource_size_t allocated = 0; + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + allocated += resource_size(res); + + return allocated; +} + static int count_dimms(struct device *dev, void *c) { int *count = c; diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index db5d7492dc8d..1a3bcd27a57a 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -230,7 +230,7 @@ static bool preamble_current(struct nvdimm_drvdata *ndd, return true; } -static char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags) +char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags) { if (!label_id || !uuid) return NULL; @@ -288,3 +288,56 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) return 0; } + +int nd_label_active_count(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + int count = 0; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = nd_label_base(ndd) + slot; + + if (!slot_valid(nd_label, slot)) { + u32 label_slot = __le32_to_cpu(nd_label->slot); + u64 size = __le64_to_cpu(nd_label->rawsize); + u64 dpa = __le64_to_cpu(nd_label->dpa); + + dev_dbg(ndd->dev, + "%s: slot%d invalid slot: %d dpa: %llx size: %llx\n", + __func__, slot, label_slot, dpa, size); + continue; + } + count++; + } + return count; +} + +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return NULL; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = nd_label_base(ndd) + slot; + if (!slot_valid(nd_label, slot)) + continue; + + if (n-- == 0) + return nd_label_base(ndd) + slot; + } + + return NULL; +} diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index d6aa0d5c6b4e..8ee1376526c7 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -125,4 +125,6 @@ int nd_label_validate(struct nvdimm_drvdata *ndd); void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, struct nd_namespace_index *src); size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd); +int nd_label_active_count(struct nvdimm_drvdata *ndd); +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); #endif /* __LABEL_H__ */ diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 4f653d1e61ad..5d81032fcfc5 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -14,6 +14,7 @@ #include #include #include +#include "nd-core.h" #include "nd.h" static void namespace_io_release(struct device *dev) @@ -23,11 +24,50 @@ static void namespace_io_release(struct device *dev) kfree(nsio); } +static void namespace_pmem_release(struct device *dev) +{ + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + kfree(nspm->alt_name); + kfree(nspm->uuid); + kfree(nspm); +} + +static void namespace_blk_release(struct device *dev) +{ + /* TODO: blk namespace support */ +} + static struct device_type namespace_io_device_type = { .name = "nd_namespace_io", .release = namespace_io_release, }; +static struct device_type namespace_pmem_device_type = { + .name = "nd_namespace_pmem", + .release = namespace_pmem_release, +}; + +static struct device_type namespace_blk_device_type = { + .name = "nd_namespace_blk", + .release = namespace_blk_release, +}; + +static bool is_namespace_pmem(struct device *dev) +{ + return dev ? dev->type == &namespace_pmem_device_type : false; +} + +static bool is_namespace_blk(struct device *dev) +{ + return dev ? dev->type == &namespace_blk_device_type : false; +} + +static bool is_namespace_io(struct device *dev) +{ + return dev ? dev->type == &namespace_io_device_type : false; +} + static ssize_t nstype_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -37,13 +77,676 @@ static ssize_t nstype_show(struct device *dev, } static DEVICE_ATTR_RO(nstype); +static ssize_t __alt_name_store(struct device *dev, const char *buf, + const size_t len) +{ + char *input, *pos, *alt_name, **ns_altname; + ssize_t rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = &nspm->alt_name; + } else if (is_namespace_blk(dev)) { + /* TODO: blk namespace support */ + return -ENXIO; + } else + return -ENXIO; + + if (dev->driver) + return -EBUSY; + + input = kmemdup(buf, len + 1, GFP_KERNEL); + if (!input) + return -ENOMEM; + + input[len] = '\0'; + pos = strim(input); + if (strlen(pos) + 1 > NSLABEL_NAME_LEN) { + rc = -EINVAL; + goto out; + } + + alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL); + if (!alt_name) { + rc = -ENOMEM; + goto out; + } + kfree(*ns_altname); + *ns_altname = alt_name; + sprintf(*ns_altname, "%s", pos); + rc = len; + +out: + kfree(input); + return rc; +} + +static ssize_t alt_name_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __alt_name_store(dev, buf, len); + dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} + +static ssize_t alt_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char *ns_altname; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = nspm->alt_name; + } else if (is_namespace_blk(dev)) { + /* TODO: blk namespace support */ + return -ENXIO; + } else + return -ENXIO; + + return sprintf(buf, "%s\n", ns_altname ? ns_altname : ""); +} +static DEVICE_ATTR_RW(alt_name); + +static int scan_free(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + bool is_blk = strncmp(label_id->id, "blk", 3) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int rc = 0; + + while (n) { + struct resource *res, *last; + resource_size_t new_start; + + last = NULL; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + last = res; + res = last; + if (!res) + return 0; + + if (n >= resource_size(res)) { + n -= resource_size(res); + nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc); + nvdimm_free_dpa(ndd, res); + /* retry with last resource deleted */ + continue; + } + + /* + * Keep BLK allocations relegated to high DPA as much as + * possible + */ + if (is_blk) + new_start = res->start + n; + else + new_start = res->start; + + rc = adjust_resource(res, new_start, resource_size(res) - n); + nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc); + break; + } + + return rc; +} + +/** + * shrink_dpa_allocation - for each dimm in region free n bytes for label_id + * @nd_region: the set of dimms to reclaim @n bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to release + * + * Assumes resources are ordered. Starting from the end try to + * adjust_resource() the allocation to @n, but if @n is larger than the + * allocation delete it and find the 'new' last allocation in the label + * set. + */ +static int shrink_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + rc = scan_free(nd_region, nd_mapping, label_id, n); + if (rc) + return rc; + } + + return 0; +} + +static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, + struct nd_region *nd_region, struct nd_mapping *nd_mapping, + resource_size_t n) +{ + bool is_blk = strncmp(label_id->id, "blk", 3) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t first_dpa; + struct resource *res; + int rc = 0; + + /* allocate blk from highest dpa first */ + if (is_blk) + first_dpa = nd_mapping->start + nd_mapping->size - n; + else + first_dpa = nd_mapping->start; + + /* first resource allocation for this label-id or dimm */ + res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n); + if (!res) + rc = -EBUSY; + + nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc); + return rc ? n : 0; +} + +static bool space_valid(bool is_pmem, struct nd_label_id *label_id, + struct resource *res) +{ + /* + * For BLK-space any space is valid, for PMEM-space, it must be + * contiguous with an existing allocation. + */ + if (!is_pmem) + return true; + if (!res || strcmp(res->name, label_id->id) == 0) + return true; + return false; +} + +enum alloc_loc { + ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER, +}; + +static resource_size_t scan_allocate(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + const resource_size_t to_allocate = n; + struct resource *res; + int first; + + retry: + first = 0; + for_each_dpa_resource(ndd, res) { + resource_size_t allocate, available = 0, free_start, free_end; + struct resource *next = res->sibling, *new_res = NULL; + enum alloc_loc loc = ALLOC_ERR; + const char *action; + int rc = 0; + + /* ignore resources outside this nd_mapping */ + if (res->start > mapping_end) + continue; + if (res->end < nd_mapping->start) + continue; + + /* space at the beginning of the mapping */ + if (!first++ && res->start > nd_mapping->start) { + free_start = nd_mapping->start; + available = res->start - free_start; + if (space_valid(is_pmem, label_id, NULL)) + loc = ALLOC_BEFORE; + } + + /* space between allocations */ + if (!loc && next) { + free_start = res->start + resource_size(res); + free_end = min(mapping_end, next->start - 1); + if (space_valid(is_pmem, label_id, res) + && free_start < free_end) { + available = free_end + 1 - free_start; + loc = ALLOC_MID; + } + } + + /* space at the end of the mapping */ + if (!loc && !next) { + free_start = res->start + resource_size(res); + free_end = mapping_end; + if (space_valid(is_pmem, label_id, res) + && free_start < free_end) { + available = free_end + 1 - free_start; + loc = ALLOC_AFTER; + } + } + + if (!loc || !available) + continue; + allocate = min(available, n); + switch (loc) { + case ALLOC_BEFORE: + if (strcmp(res->name, label_id->id) == 0) { + /* adjust current resource up */ + if (is_pmem) + return n; + rc = adjust_resource(res, res->start - allocate, + resource_size(res) + allocate); + action = "cur grow up"; + } else + action = "allocate"; + break; + case ALLOC_MID: + if (strcmp(next->name, label_id->id) == 0) { + /* adjust next resource up */ + if (is_pmem) + return n; + rc = adjust_resource(next, next->start + - allocate, resource_size(next) + + allocate); + new_res = next; + action = "next grow up"; + } else if (strcmp(res->name, label_id->id) == 0) { + action = "grow down"; + } else + action = "allocate"; + break; + case ALLOC_AFTER: + if (strcmp(res->name, label_id->id) == 0) + action = "grow down"; + else + action = "allocate"; + break; + default: + return n; + } + + if (strcmp(action, "allocate") == 0) { + /* BLK allocate bottom up */ + if (!is_pmem) + free_start += available - allocate; + else if (free_start != nd_mapping->start) + return n; + + new_res = nvdimm_allocate_dpa(ndd, label_id, + free_start, allocate); + if (!new_res) + rc = -EBUSY; + } else if (strcmp(action, "grow down") == 0) { + /* adjust current resource down */ + rc = adjust_resource(res, res->start, resource_size(res) + + allocate); + } + + if (!new_res) + new_res = res; + + nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n", + action, loc, rc); + + if (rc) + return n; + + n -= allocate; + if (n) { + /* + * Retry scan with newly inserted resources. + * For example, if we did an ALLOC_BEFORE + * insertion there may also have been space + * available for an ALLOC_AFTER insertion, so we + * need to check this same resource again + */ + goto retry; + } else + return 0; + } + + if (is_pmem && n == to_allocate) + return init_dpa_allocation(label_id, nd_region, nd_mapping, n); + return n; +} + +/** + * grow_dpa_allocation - for each dimm allocate n bytes for @label_id + * @nd_region: the set of dimms to allocate @n more bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to add to the existing allocation + * + * Assumes resources are ordered. For BLK regions, first consume + * BLK-only available DPA free space, then consume PMEM-aliased DPA + * space starting at the highest DPA. For PMEM regions start + * allocations from the start of an interleave set and end at the first + * BLK allocation or the end of the interleave set, whichever comes + * first. + */ +static int grow_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + rc = scan_allocate(nd_region, nd_mapping, label_id, n); + if (rc) + return rc; + } + + return 0; +} + +static void nd_namespace_pmem_set_size(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + struct resource *res = &nspm->nsio.res; + + res->start = nd_region->ndr_start; + res->end = nd_region->ndr_start + size - 1; +} + +static ssize_t __size_store(struct device *dev, unsigned long long val) +{ + resource_size_t allocated = 0, available = 0; + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_mapping *nd_mapping; + struct nvdimm_drvdata *ndd; + struct nd_label_id label_id; + u32 flags = 0, remainder; + u8 *uuid = NULL; + int rc, i; + + if (dev->driver) + return -EBUSY; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + } else if (is_namespace_blk(dev)) { + /* TODO: blk namespace support */ + return -ENXIO; + } + + /* + * We need a uuid for the allocation-label and dimm(s) on which + * to store the label. + */ + if (!uuid || nd_region->ndr_mappings == 0) + return -ENXIO; + + div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder); + if (remainder) { + dev_dbg(dev, "%llu is not %dK aligned\n", val, + (SZ_4K * nd_region->ndr_mappings) / SZ_1K); + return -EINVAL; + } + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + ndd = to_ndd(nd_mapping); + + /* + * All dimms in an interleave set, or the base dimm for a blk + * region, need to be enabled for the size to be changed. + */ + if (!ndd) + return -ENXIO; + + allocated += nvdimm_allocated_dpa(ndd, &label_id); + } + available = nd_region_available_dpa(nd_region); + + if (val > available + allocated) + return -ENOSPC; + + if (val == allocated) + return 0; + + val = div_u64(val, nd_region->ndr_mappings); + allocated = div_u64(allocated, nd_region->ndr_mappings); + if (val < allocated) + rc = shrink_dpa_allocation(nd_region, &label_id, + allocated - val); + else + rc = grow_dpa_allocation(nd_region, &label_id, val - allocated); + + if (rc) + return rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + nd_namespace_pmem_set_size(nd_region, nspm, + val * nd_region->ndr_mappings); + } + + return rc; +} + +static ssize_t size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + unsigned long long val; + u8 **uuid = NULL; + int rc; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __size_store(dev, val); + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = &nspm->uuid; + } else if (is_namespace_blk(dev)) { + /* TODO: blk namespace support */ + rc = -ENXIO; + } + + if (rc == 0 && val == 0 && uuid) { + /* setting size zero == 'delete namespace' */ + kfree(*uuid); + *uuid = NULL; + } + + dev_dbg(dev, "%s: %llx %s (%d)\n", __func__, val, rc < 0 + ? "fail" : "success", rc); + + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&nspm->nsio.res)); + } else if (is_namespace_blk(dev)) { + /* TODO: blk namespace support */ + return -ENXIO; + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&nsio->res)); + } else + return -ENXIO; +} +static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 *uuid; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + } else if (is_namespace_blk(dev)) { + /* TODO: blk namespace support */ + return -ENXIO; + } else + return -ENXIO; + + if (uuid) + return sprintf(buf, "%pUb\n", uuid); + return sprintf(buf, "\n"); +} + +/** + * namespace_update_uuid - check for a unique uuid and whether we're "renaming" + * @nd_region: parent region so we can updates all dimms in the set + * @dev: namespace type for generating label_id + * @new_uuid: incoming uuid + * @old_uuid: reference to the uuid storage location in the namespace object + */ +static int namespace_update_uuid(struct nd_region *nd_region, + struct device *dev, u8 *new_uuid, u8 **old_uuid) +{ + u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0; + struct nd_label_id old_label_id; + struct nd_label_id new_label_id; + int i, rc; + + rc = nd_is_uuid_unique(dev, new_uuid) ? 0 : -EINVAL; + if (rc) { + kfree(new_uuid); + return rc; + } + + if (*old_uuid == NULL) + goto out; + + nd_label_gen_id(&old_label_id, *old_uuid, flags); + nd_label_gen_id(&new_label_id, new_uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, old_label_id.id) == 0) + sprintf((void *) res->name, "%s", + new_label_id.id); + } + kfree(*old_uuid); + out: + *old_uuid = new_uuid; + return 0; +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + u8 *uuid = NULL; + u8 **ns_uuid; + ssize_t rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_uuid = &nspm->uuid; + } else if (is_namespace_blk(dev)) { + /* TODO: blk namespace support */ + return -ENXIO; + } else + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = nd_uuid_store(dev, &uuid, buf, len); + if (rc >= 0) + rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct resource *res; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + res = &nspm->nsio.res; + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + res = &nsio->res; + } else + return -ENXIO; + + /* no address to convey if the namespace has no allocation */ + if (resource_size(res) == 0) + return -ENXIO; + return sprintf(buf, "%#llx\n", (unsigned long long) res->start); +} +static DEVICE_ATTR_RO(resource); + static struct attribute *nd_namespace_attributes[] = { &dev_attr_nstype.attr, + &dev_attr_size.attr, + &dev_attr_uuid.attr, + &dev_attr_resource.attr, + &dev_attr_alt_name.attr, NULL, }; +static umode_t namespace_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + if (a == &dev_attr_resource.attr) { + if (is_namespace_blk(dev)) + return 0; + return a->mode; + } + + if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { + if (a == &dev_attr_size.attr) + return S_IWUSR | S_IRUGO; + return a->mode; + } + + if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr) + return a->mode; + + return 0; +} + static struct attribute_group nd_namespace_attribute_group = { .attrs = nd_namespace_attributes, + .is_visible = namespace_visible, }; static const struct attribute_group *nd_namespace_attribute_groups[] = { @@ -81,23 +784,318 @@ static struct device **create_namespace_io(struct nd_region *nd_region) return devs; } +static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, + u64 cookie, u16 pos) +{ + struct nd_namespace_label *found = NULL; + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *nd_label; + bool found_uuid = false; + int l; + + for_each_label(l, nd_label, nd_mapping->labels) { + u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + u16 position = __le16_to_cpu(nd_label->position); + u16 nlabel = __le16_to_cpu(nd_label->nlabel); + + if (isetcookie != cookie) + continue; + + if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0) + continue; + + if (found_uuid) { + dev_dbg(to_ndd(nd_mapping)->dev, + "%s duplicate entry for uuid\n", + __func__); + return false; + } + found_uuid = true; + if (nlabel != nd_region->ndr_mappings) + continue; + if (position != pos) + continue; + found = nd_label; + break; + } + if (found) + break; + } + return found != NULL; +} + +static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) +{ + struct nd_namespace_label *select = NULL; + int i; + + if (!pmem_id) + return -ENODEV; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *nd_label; + u64 hw_start, hw_end, pmem_start, pmem_end; + int l; + + for_each_label(l, nd_label, nd_mapping->labels) + if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0) + break; + + if (!nd_label) { + WARN_ON(1); + return -EINVAL; + } + + select = nd_label; + /* + * Check that this label is compliant with the dpa + * range published in NFIT + */ + hw_start = nd_mapping->start; + hw_end = hw_start + nd_mapping->size; + pmem_start = __le64_to_cpu(select->dpa); + pmem_end = pmem_start + __le64_to_cpu(select->rawsize); + if (pmem_start == hw_start && pmem_end <= hw_end) + /* pass */; + else + return -EINVAL; + + nd_mapping->labels[0] = select; + nd_mapping->labels[1] = NULL; + } + return 0; +} + +/** + * find_pmem_label_set - validate interleave set labelling, retrieve label0 + * @nd_region: region with mappings to validate + */ +static int find_pmem_label_set(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm) +{ + u64 cookie = nd_region_interleave_set_cookie(nd_region); + struct nd_namespace_label *nd_label; + u8 select_id[NSLABEL_UUID_LEN]; + resource_size_t size = 0; + u8 *pmem_id = NULL; + int rc = -ENODEV, l; + u16 i; + + if (cookie == 0) + return -ENXIO; + + /* + * Find a complete set of labels by uuid. By definition we can start + * with any mapping as the reference label + */ + for_each_label(l, nd_label, nd_region->mapping[0].labels) { + u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + + if (isetcookie != cookie) + continue; + + for (i = 0; nd_region->ndr_mappings; i++) + if (!has_uuid_at_pos(nd_region, nd_label->uuid, + cookie, i)) + break; + if (i < nd_region->ndr_mappings) { + /* + * Give up if we don't find an instance of a + * uuid at each position (from 0 to + * nd_region->ndr_mappings - 1), or if we find a + * dimm with two instances of the same uuid. + */ + rc = -EINVAL; + goto err; + } else if (pmem_id) { + /* + * If there is more than one valid uuid set, we + * need userspace to clean this up. + */ + rc = -EBUSY; + goto err; + } + memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN); + pmem_id = select_id; + } + + /* + * Fix up each mapping's 'labels' to have the validated pmem label for + * that position at labels[0], and NULL at labels[1]. In the process, + * check that the namespace aligns with interleave-set. We know + * that it does not overlap with any blk namespaces by virtue of + * the dimm being enabled (i.e. nd_label_reserve_dpa() + * succeeded). + */ + rc = select_pmem_id(nd_region, pmem_id); + if (rc) + goto err; + + /* Calculate total size and populate namespace properties from label0 */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *label0 = nd_mapping->labels[0]; + + size += __le64_to_cpu(label0->rawsize); + if (__le16_to_cpu(label0->position) != 0) + continue; + WARN_ON(nspm->alt_name || nspm->uuid); + nspm->alt_name = kmemdup((void __force *) label0->name, + NSLABEL_NAME_LEN, GFP_KERNEL); + nspm->uuid = kmemdup((void __force *) label0->uuid, + NSLABEL_UUID_LEN, GFP_KERNEL); + } + + if (!nspm->alt_name || !nspm->uuid) { + rc = -ENOMEM; + goto err; + } + + nd_namespace_pmem_set_size(nd_region, nspm, size); + + return 0; + err: + switch (rc) { + case -EINVAL: + dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__); + break; + case -ENODEV: + dev_dbg(&nd_region->dev, "%s: label not found\n", __func__); + break; + default: + dev_dbg(&nd_region->dev, "%s: unexpected err: %d\n", + __func__, rc); + break; + } + return rc; +} + +static struct device **create_namespace_pmem(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct device *dev, **devs; + struct resource *res; + int rc; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + rc = find_pmem_label_set(nd_region, nspm); + if (rc == -ENODEV) { + int i; + + /* Pass, try to permit namespace creation... */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + } + + /* Publish a zero-sized namespace for userspace to configure. */ + nd_namespace_pmem_set_size(nd_region, nspm, 0); + + rc = 0; + } else if (rc) + goto err; + + devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); + if (!devs) + goto err; + + devs[0] = dev; + return devs; + + err: + namespace_pmem_release(&nspm->nsio.dev); + return NULL; +} + +static int init_active_labels(struct nd_region *nd_region) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int count, j; + + /* + * If the dimm is disabled then prevent the region from + * being activated if it aliases DPA. + */ + if (!ndd) { + if ((nvdimm->flags & NDD_ALIASING) == 0) + return 0; + dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n", + dev_name(&nd_mapping->nvdimm->dev)); + return -ENXIO; + } + nd_mapping->ndd = ndd; + atomic_inc(&nvdimm->busy); + get_ndd(ndd); + + count = nd_label_active_count(ndd); + dev_dbg(ndd->dev, "%s: %d\n", __func__, count); + if (!count) + continue; + nd_mapping->labels = kcalloc(count + 1, sizeof(void *), + GFP_KERNEL); + if (!nd_mapping->labels) + return -ENOMEM; + for (j = 0; j < count; j++) { + struct nd_namespace_label *label; + + label = nd_label_active(ndd, j); + nd_mapping->labels[j] = label; + } + } + + return 0; +} + int nd_region_register_namespaces(struct nd_region *nd_region, int *err) { struct device **devs = NULL; - int i; + int i, rc = 0, type; *err = 0; - switch (nd_region_to_nstype(nd_region)) { + nvdimm_bus_lock(&nd_region->dev); + rc = init_active_labels(nd_region); + if (rc) { + nvdimm_bus_unlock(&nd_region->dev); + return rc; + } + + type = nd_region_to_nstype(nd_region); + switch (type) { case ND_DEVICE_NAMESPACE_IO: devs = create_namespace_io(nd_region); break; + case ND_DEVICE_NAMESPACE_PMEM: + devs = create_namespace_pmem(nd_region); + break; default: break; } + nvdimm_bus_unlock(&nd_region->dev); if (!devs) return -ENODEV; + nd_region->ns_seed = devs[0]; for (i = 0; devs[i]; i++) { struct device *dev = devs[i]; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 6a4b2c066ee7..c6c889292bab 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -56,4 +56,16 @@ int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus); int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus); int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus); int nd_match_dimm(struct device *dev, void *data); +struct nd_label_id; +char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags); +bool nd_is_uuid_unique(struct device *dev, u8 *uuid); +struct nd_region; +struct nvdimm_drvdata; +struct nd_mapping; +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, resource_size_t *overlap); +resource_size_t nd_region_available_dpa(struct nd_region *nd_region); +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id); +void get_ndd(struct nvdimm_drvdata *ndd); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 401fa0d5b6ea..03e610cd9f43 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "label.h" struct nvdimm_drvdata { @@ -25,6 +26,7 @@ struct nvdimm_drvdata { void *data; int ns_current, ns_next; struct resource dpa; + struct kref kref; }; struct nd_region_namespaces { @@ -59,12 +61,19 @@ static inline struct nd_namespace_index *to_next_namespace_index( (unsigned long long) (res ? resource_size(res) : 0), \ (unsigned long long) (res ? res->start : 0), ##arg) +#define for_each_label(l, label, labels) \ + for (l = 0; (label = labels ? labels[l] : NULL); l++) + +#define for_each_dpa_resource(ndd, res) \ + for (res = (ndd)->dpa.child; res; res = res->sibling) + #define for_each_dpa_resource_safe(ndd, res, next) \ for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \ res; res = next, next = next ? next->sibling : NULL) struct nd_region { struct device dev; + struct device *ns_seed; u16 ndr_mappings; u64 ndr_size; u64 ndr_start; @@ -88,20 +97,28 @@ enum nd_async_mode { ND_ASYNC, }; +void wait_nvdimm_bus_probe_idle(struct device *dev); void nd_device_register(struct device *dev); void nd_device_unregister(struct device *dev, enum nd_async_mode mode); +int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, + size_t len); int __init nvdimm_init(void); int __init nd_region_init(void); void nvdimm_exit(void); void nd_region_exit(void); +struct nvdimm; +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); struct nd_region *to_nd_region(struct device *dev); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); void nvdimm_bus_lock(struct device *dev); void nvdimm_bus_unlock(struct device *dev); bool is_nvdimm_bus_locked(struct device *dev); +void nvdimm_drvdata_release(struct kref *kref); +void put_ndd(struct nvdimm_drvdata *ndd); int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd); void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res); struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d46975ed9e40..90902a142e35 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -203,6 +203,23 @@ static int nd_pmem_probe(struct device *dev) struct nd_namespace_io *nsio = to_nd_namespace_io(dev); struct pmem_device *pmem; + if (resource_size(&nsio->res) < ND_MIN_NAMESPACE_SIZE) { + resource_size_t size = resource_size(&nsio->res); + + dev_dbg(dev, "%s: size: %pa, too small must be at least %#x\n", + __func__, &size, ND_MIN_NAMESPACE_SIZE); + return -ENODEV; + } + + if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_PMEM) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + if (!nspm->uuid) { + dev_dbg(dev, "%s: uuid not set\n", __func__); + return -ENODEV; + } + } + pmem = pmem_alloc(dev, &nsio->res, nd_region->id); if (IS_ERR(pmem)) return PTR_ERR(pmem); @@ -222,13 +239,14 @@ static int nd_pmem_remove(struct device *dev) MODULE_ALIAS("pmem"); MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM); static struct nd_device_driver nd_pmem_driver = { .probe = nd_pmem_probe, .remove = nd_pmem_remove, .drv = { .name = "nd_pmem", }, - .type = ND_DRIVER_NAMESPACE_IO, + .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM, }; static int __init pmem_init(void) diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index ade3dba81afd..9aba44e483e0 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -61,8 +61,11 @@ static int child_unregister(struct device *dev, void *data) static int nd_region_remove(struct device *dev) { + struct nd_region *nd_region = to_nd_region(dev); + /* flush attribute readers and disable */ nvdimm_bus_lock(dev); + nd_region->ns_seed = NULL; dev_set_drvdata(dev, NULL); nvdimm_bus_unlock(dev); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 1571424578f0..b45806f7176d 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "nd-core.h" #include "nd.h" @@ -99,6 +100,58 @@ int nd_region_to_nstype(struct nd_region *nd_region) return 0; } +EXPORT_SYMBOL(nd_region_to_nstype); + +static int is_uuid_busy(struct device *dev, void *data) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + u8 *uuid = data; + + switch (nd_region_to_nstype(nd_region)) { + case ND_DEVICE_NAMESPACE_PMEM: { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + if (!nspm->uuid) + break; + if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0) + return -EBUSY; + break; + } + case ND_DEVICE_NAMESPACE_BLK: { + /* TODO: blk namespace support */ + break; + } + default: + break; + } + + return 0; +} + +static int is_namespace_uuid_busy(struct device *dev, void *data) +{ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + return device_for_each_child(dev, data, is_uuid_busy); + return 0; +} + +/** + * nd_is_uuid_unique - verify that no other namespace has @uuid + * @dev: any device on a nvdimm_bus + * @uuid: uuid to check + */ +bool nd_is_uuid_unique(struct device *dev, u8 *uuid) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); + if (device_for_each_child(&nvdimm_bus->dev, uuid, + is_namespace_uuid_busy) != 0) + return false; + return true; +} static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -151,6 +204,60 @@ static ssize_t set_cookie_show(struct device *dev, } static DEVICE_ATTR_RO(set_cookie); +resource_size_t nd_region_available_dpa(struct nd_region *nd_region) +{ + resource_size_t blk_max_overlap = 0, available, overlap; + int i; + + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + + retry: + available = 0; + overlap = blk_max_overlap; + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + /* if a dimm is disabled the available capacity is zero */ + if (!ndd) + return 0; + + if (is_nd_pmem(&nd_region->dev)) { + available += nd_pmem_available_dpa(nd_region, + nd_mapping, &overlap); + if (overlap > blk_max_overlap) { + blk_max_overlap = overlap; + goto retry; + } + } else if (is_nd_blk(&nd_region->dev)) { + /* TODO: BLK Namespace support */ + } + } + + return available; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long available = 0; + + /* + * Flush in-flight updates and grab a snapshot of the available + * size. Of course, this value is potentially invalidated the + * memory nvdimm_bus_lock() is dropped, but that's userspace's + * problem to not race itself. + */ + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + available = nd_region_available_dpa(nd_region); + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%llu\n", available); +} +static DEVICE_ATTR_RO(available_size); + static ssize_t init_namespaces_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -168,11 +275,29 @@ static ssize_t init_namespaces_show(struct device *dev, } static DEVICE_ATTR_RO(init_namespaces); +static ssize_t namespace_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->ns_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(namespace_seed); + static struct attribute *nd_region_attributes[] = { &dev_attr_size.attr, &dev_attr_nstype.attr, &dev_attr_mappings.attr, &dev_attr_set_cookie.attr, + &dev_attr_available_size.attr, + &dev_attr_namespace_seed.attr, &dev_attr_init_namespaces.attr, NULL, }; @@ -182,12 +307,18 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) struct device *dev = container_of(kobj, typeof(*dev), kobj); struct nd_region *nd_region = to_nd_region(dev); struct nd_interleave_set *nd_set = nd_region->nd_set; + int type = nd_region_to_nstype(nd_region); - if (a != &dev_attr_set_cookie.attr) + if (a != &dev_attr_set_cookie.attr + && a != &dev_attr_available_size.attr) return a->mode; - if (is_nd_pmem(dev) && nd_set) - return a->mode; + if ((type == ND_DEVICE_NAMESPACE_PMEM + || type == ND_DEVICE_NAMESPACE_BLK) + && a == &dev_attr_available_size.attr) + return a->mode; + else if (is_nd_pmem(dev) && nd_set) + return a->mode; return 0; } @@ -198,6 +329,15 @@ struct attribute_group nd_region_attribute_group = { }; EXPORT_SYMBOL_GPL(nd_region_attribute_group); +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) +{ + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (nd_set) + return nd_set->cookie; + return 0; +} + /* * Upon successful probe/remove, take/release a reference on the * associated interleave set (if present) @@ -205,18 +345,20 @@ EXPORT_SYMBOL_GPL(nd_region_attribute_group); static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, struct device *dev, bool probe) { - if (is_nd_pmem(dev) || is_nd_blk(dev)) { + if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) { struct nd_region *nd_region = to_nd_region(dev); int i; for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = nd_mapping->ndd; struct nvdimm *nvdimm = nd_mapping->nvdimm; - if (probe) - atomic_inc(&nvdimm->busy); - else - atomic_dec(&nvdimm->busy); + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + put_ndd(ndd); + nd_mapping->ndd = NULL; + atomic_dec(&nvdimm->busy); } } } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 1b627b109360..c130972e08c4 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -41,10 +41,20 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_namespace_label; +struct nvdimm_drvdata; struct nd_mapping { struct nvdimm *nvdimm; + struct nd_namespace_label **labels; u64 start; u64 size; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. + */ + struct nvdimm_drvdata *ndd; }; struct nvdimm_bus_descriptor { diff --git a/include/linux/nd.h b/include/linux/nd.h index da70e9962197..255c38a83083 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -28,16 +28,40 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_io - infrastructure for loading an nd_pmem instance + * @dev: namespace device created by the nd region driver + * @res: struct resource conversion of a NFIT SPA table + */ struct nd_namespace_io { struct device dev; struct resource res; }; +/** + * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory + * @nsio: device and system physical address range to drive + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + */ +struct nd_namespace_pmem { + struct nd_namespace_io nsio; + char *alt_name; + u8 *uuid; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); } +static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return container_of(nsio, struct nd_namespace_pmem, nsio); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 1357a87b8714..2b94ea2287bb 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -190,4 +190,8 @@ enum nd_driver_flags { ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM, ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK, }; + +enum { + ND_MIN_NAMESPACE_SIZE = 0x00400000, +}; #endif /* __NDCTL_H__ */ -- cgit v1.2.3-59-g8ed1b From 1b40e09a1232de537b193fa1b6b3ef16d3a1e397 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:34:01 -0400 Subject: libnvdimm: blk labels and namespace instantiation A blk label set describes a namespace comprised of one or more discontiguous dpa ranges on a single dimm. They may alias with one or more pmem interleave sets that include the given dimm. This is the runtime/volatile configuration infrastructure for sysfs manipulation of 'alt_name', 'uuid', 'size', and 'sector_size'. A later patch will make these settings persistent by writing back the label(s). Unlike pmem namespaces, multiple blk namespaces can be created per region. Once a blk namespace has been created a new seed device (unconfigured child of a parent blk region) is instantiated. As long as a region has 'available_size' != 0 new child namespaces may be created. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/nvdimm/core.c | 40 ++++ drivers/nvdimm/dimm_devs.c | 36 +++ drivers/nvdimm/namespace_devs.c | 498 +++++++++++++++++++++++++++++++++++++--- drivers/nvdimm/nd-core.h | 8 + drivers/nvdimm/nd.h | 5 + drivers/nvdimm/region_devs.c | 17 +- include/linux/libnvdimm.h | 3 + include/linux/nd.h | 25 ++ 8 files changed, 594 insertions(+), 38 deletions(-) (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index cf99cce8ef33..dd824d7c2669 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -173,6 +173,46 @@ int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, return 0; } +ssize_t nd_sector_size_show(unsigned long current_lbasize, + const unsigned long *supported, char *buf) +{ + ssize_t len = 0; + int i; + + for (i = 0; supported[i]; i++) + if (current_lbasize == supported[i]) + len += sprintf(buf + len, "[%ld] ", supported[i]); + else + len += sprintf(buf + len, "%ld ", supported[i]); + len += sprintf(buf + len, "\n"); + return len; +} + +ssize_t nd_sector_size_store(struct device *dev, const char *buf, + unsigned long *current_lbasize, const unsigned long *supported) +{ + unsigned long lbasize; + int rc, i; + + if (dev->driver) + return -EBUSY; + + rc = kstrtoul(buf, 0, &lbasize); + if (rc) + return rc; + + for (i = 0; supported[i]; i++) + if (lbasize == supported[i]) + break; + + if (supported[i]) { + *current_lbasize = lbasize; + return 0; + } else { + return -EINVAL; + } +} + static ssize_t commands_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index b55acef179ba..101d3b76e405 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -289,6 +289,42 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } EXPORT_SYMBOL_GPL(nvdimm_create); +/** + * nd_blk_available_dpa - account the unused dpa of BLK region + * @nd_mapping: container of dpa-resource-root + labels + * + * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges. + */ +resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_end, busy = 0, available; + struct resource *res; + + if (!ndd) + return 0; + + map_end = nd_mapping->start + nd_mapping->size - 1; + for_each_dpa_resource(ndd, res) + if (res->start >= nd_mapping->start && res->start < map_end) { + resource_size_t end = min(map_end, res->end); + + busy += end - res->start + 1; + } else if (res->end >= nd_mapping->start + && res->end <= map_end) { + busy += res->end - nd_mapping->start; + } else if (nd_mapping->start > res->start + && nd_mapping->start < res->end) { + /* total eclipse of the BLK region mapping */ + busy += nd_mapping->size; + } + + available = map_end - nd_mapping->start + 1; + if (busy < available) + return available - busy; + return 0; +} + /** * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa * @nd_mapping: container of dpa-resource-root + labels diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 5d81032fcfc5..ad0ec09ca40f 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -35,7 +35,15 @@ static void namespace_pmem_release(struct device *dev) static void namespace_blk_release(struct device *dev) { - /* TODO: blk namespace support */ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + + if (nsblk->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nsblk->id); + kfree(nsblk->alt_name); + kfree(nsblk->uuid); + kfree(nsblk->res); + kfree(nsblk); } static struct device_type namespace_io_device_type = { @@ -88,8 +96,9 @@ static ssize_t __alt_name_store(struct device *dev, const char *buf, ns_altname = &nspm->alt_name; } else if (is_namespace_blk(dev)) { - /* TODO: blk namespace support */ - return -ENXIO; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_altname = &nsblk->alt_name; } else return -ENXIO; @@ -122,6 +131,24 @@ out: return rc; } +static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk) +{ + struct nd_region *nd_region = to_nd_region(nsblk->dev.parent); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + resource_size_t size = 0; + struct resource *res; + + if (!nsblk->uuid) + return 0; + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + size += resource_size(res); + return size; +} + static ssize_t alt_name_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -148,8 +175,9 @@ static ssize_t alt_name_show(struct device *dev, ns_altname = nspm->alt_name; } else if (is_namespace_blk(dev)) { - /* TODO: blk namespace support */ - return -ENXIO; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_altname = nsblk->alt_name; } else return -ENXIO; @@ -195,6 +223,8 @@ static int scan_free(struct nd_region *nd_region, new_start = res->start; rc = adjust_resource(res, new_start, resource_size(res) - n); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc); break; } @@ -255,14 +285,15 @@ static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, return rc ? n : 0; } -static bool space_valid(bool is_pmem, struct nd_label_id *label_id, - struct resource *res) +static bool space_valid(bool is_pmem, bool is_reserve, + struct nd_label_id *label_id, struct resource *res) { /* * For BLK-space any space is valid, for PMEM-space, it must be - * contiguous with an existing allocation. + * contiguous with an existing allocation unless we are + * reserving pmem. */ - if (!is_pmem) + if (is_reserve || !is_pmem) return true; if (!res || strcmp(res->name, label_id->id) == 0) return true; @@ -278,6 +309,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, resource_size_t n) { resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); const resource_size_t to_allocate = n; @@ -303,7 +335,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, if (!first++ && res->start > nd_mapping->start) { free_start = nd_mapping->start; available = res->start - free_start; - if (space_valid(is_pmem, label_id, NULL)) + if (space_valid(is_pmem, is_reserve, label_id, NULL)) loc = ALLOC_BEFORE; } @@ -311,7 +343,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, if (!loc && next) { free_start = res->start + resource_size(res); free_end = min(mapping_end, next->start - 1); - if (space_valid(is_pmem, label_id, res) + if (space_valid(is_pmem, is_reserve, label_id, res) && free_start < free_end) { available = free_end + 1 - free_start; loc = ALLOC_MID; @@ -322,7 +354,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, if (!loc && !next) { free_start = res->start + resource_size(res); free_end = mapping_end; - if (space_valid(is_pmem, label_id, res) + if (space_valid(is_pmem, is_reserve, label_id, res) && free_start < free_end) { available = free_end + 1 - free_start; loc = ALLOC_AFTER; @@ -336,7 +368,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_BEFORE: if (strcmp(res->name, label_id->id) == 0) { /* adjust current resource up */ - if (is_pmem) + if (is_pmem && !is_reserve) return n; rc = adjust_resource(res, res->start - allocate, resource_size(res) + allocate); @@ -347,7 +379,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_MID: if (strcmp(next->name, label_id->id) == 0) { /* adjust next resource up */ - if (is_pmem) + if (is_pmem && !is_reserve) return n; rc = adjust_resource(next, next->start - allocate, resource_size(next) @@ -373,7 +405,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, /* BLK allocate bottom up */ if (!is_pmem) free_start += available - allocate; - else if (free_start != nd_mapping->start) + else if (!is_reserve && free_start != nd_mapping->start) return n; new_res = nvdimm_allocate_dpa(ndd, label_id, @@ -384,6 +416,8 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, /* adjust current resource down */ rc = adjust_resource(res, res->start, resource_size(res) + allocate); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; } if (!new_res) @@ -409,11 +443,108 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, return 0; } - if (is_pmem && n == to_allocate) + /* + * If we allocated nothing in the BLK case it may be because we are in + * an initial "pmem-reserve pass". Only do an initial BLK allocation + * when none of the DPA space is reserved. + */ + if ((is_pmem || !ndd->dpa.child) && n == to_allocate) return init_dpa_allocation(label_id, nd_region, nd_mapping, n); return n; } +static int merge_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + if (strncmp("pmem", label_id->id, 4) == 0) + return 0; + retry: + for_each_dpa_resource(ndd, res) { + int rc; + struct resource *next = res->sibling; + resource_size_t end = res->start + resource_size(res); + + if (!next || strcmp(res->name, label_id->id) != 0 + || strcmp(next->name, label_id->id) != 0 + || end != next->start) + continue; + end += resource_size(next); + nvdimm_free_dpa(ndd, next); + rc = adjust_resource(res, res->start, end - res->start); + nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc); + if (rc) + return rc; + res->flags |= DPA_RESOURCE_ADJUSTED; + goto retry; + } + + return 0; +} + +static int __reserve_free_pmem(struct device *dev, void *data) +{ + struct nvdimm *nvdimm = data; + struct nd_region *nd_region; + struct nd_label_id label_id; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + if (nd_region->ndr_mappings == 0) + return 0; + + memset(&label_id, 0, sizeof(label_id)); + strcat(label_id.id, "pmem-reserve"); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t n, rem = 0; + + if (nd_mapping->nvdimm != nvdimm) + continue; + + n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem); + if (n == 0) + return 0; + rem = scan_allocate(nd_region, nd_mapping, &label_id, n); + dev_WARN_ONCE(&nd_region->dev, rem, + "pmem reserve underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + return rem ? -ENXIO : 0; + } + + return 0; +} + +static void release_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *_res; + + for_each_dpa_resource_safe(ndd, res, _res) + if (strcmp(res->name, "pmem-reserve") == 0) + nvdimm_free_dpa(ndd, res); +} + +static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int rc; + + rc = device_for_each_child(&nvdimm_bus->dev, nvdimm, + __reserve_free_pmem); + if (rc) + release_free_pmem(nvdimm_bus, nd_mapping); + return rc; +} + /** * grow_dpa_allocation - for each dimm allocate n bytes for @label_id * @nd_region: the set of dimms to allocate @n more bytes from @@ -430,13 +561,45 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, static int grow_dpa_allocation(struct nd_region *nd_region, struct nd_label_id *label_id, resource_size_t n) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; int i; for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - int rc; + resource_size_t rem = n; + int rc, j; + + /* + * In the BLK case try once with all unallocated PMEM + * reserved, and once without + */ + for (j = is_pmem; j < 2; j++) { + bool blk_only = j == 0; + + if (blk_only) { + rc = reserve_free_pmem(nvdimm_bus, nd_mapping); + if (rc) + return rc; + } + rem = scan_allocate(nd_region, nd_mapping, + label_id, rem); + if (blk_only) + release_free_pmem(nvdimm_bus, nd_mapping); - rc = scan_allocate(nd_region, nd_mapping, label_id, n); + /* try again and allow encroachments into PMEM */ + if (rem == 0) + break; + } + + dev_WARN_ONCE(&nd_region->dev, rem, + "allocation underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + if (rem) + return -ENXIO; + + rc = merge_dpa(nd_region, nd_mapping, label_id); if (rc) return rc; } @@ -472,8 +635,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) uuid = nspm->uuid; } else if (is_namespace_blk(dev)) { - /* TODO: blk namespace support */ - return -ENXIO; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + flags = NSLABEL_FLAG_LOCAL; } /* @@ -528,6 +693,14 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) nd_namespace_pmem_set_size(nd_region, nspm, val * nd_region->ndr_mappings); + } else if (is_namespace_blk(dev)) { + /* + * Try to delete the namespace if we deleted all of its + * allocation and this is not the seed device for the + * region. + */ + if (val == 0 && nd_region->ns_seed != dev) + nd_device_unregister(dev, ND_ASYNC); } return rc; @@ -554,8 +727,9 @@ static ssize_t size_store(struct device *dev, uuid = &nspm->uuid; } else if (is_namespace_blk(dev)) { - /* TODO: blk namespace support */ - rc = -ENXIO; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = &nsblk->uuid; } if (rc == 0 && val == 0 && uuid) { @@ -576,21 +750,23 @@ static ssize_t size_store(struct device *dev, static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { + unsigned long long size = 0; + + nvdimm_bus_lock(dev); if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - return sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&nspm->nsio.res)); + size = resource_size(&nspm->nsio.res); } else if (is_namespace_blk(dev)) { - /* TODO: blk namespace support */ - return -ENXIO; + size = nd_namespace_blk_size(to_nd_namespace_blk(dev)); } else if (is_namespace_io(dev)) { struct nd_namespace_io *nsio = to_nd_namespace_io(dev); - return sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&nsio->res)); - } else - return -ENXIO; + size = resource_size(&nsio->res); + } + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%llu\n", size); } static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); @@ -604,8 +780,9 @@ static ssize_t uuid_show(struct device *dev, uuid = nspm->uuid; } else if (is_namespace_blk(dev)) { - /* TODO: blk namespace support */ - return -ENXIO; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; } else return -ENXIO; @@ -669,8 +846,9 @@ static ssize_t uuid_store(struct device *dev, ns_uuid = &nspm->uuid; } else if (is_namespace_blk(dev)) { - /* TODO: blk namespace support */ - return -ENXIO; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_uuid = &nsblk->uuid; } else return -ENXIO; @@ -712,12 +890,48 @@ static ssize_t resource_show(struct device *dev, } static DEVICE_ATTR_RO(resource); +static const unsigned long ns_lbasize_supported[] = { 512, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + if (!is_namespace_blk(dev)) + return -ENXIO; + + return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf); +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + ssize_t rc; + + if (!is_namespace_blk(dev)) + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_sector_size_store(dev, buf, &nsblk->lbasize, + ns_lbasize_supported); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + static struct attribute *nd_namespace_attributes[] = { &dev_attr_nstype.attr, &dev_attr_size.attr, &dev_attr_uuid.attr, &dev_attr_resource.attr, &dev_attr_alt_name.attr, + &dev_attr_sector_size.attr, NULL, }; @@ -735,6 +949,10 @@ static umode_t namespace_visible(struct kobject *kobj, if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { if (a == &dev_attr_size.attr) return S_IWUSR | S_IRUGO; + + if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr) + return 0; + return a->mode; } @@ -1022,6 +1240,176 @@ static struct device **create_namespace_pmem(struct nd_region *nd_region) return NULL; } +struct resource *nsblk_add_resource(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, + resource_size_t start) +{ + struct nd_label_id label_id; + struct resource *res; + + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + res = krealloc(nsblk->res, + sizeof(void *) * (nsblk->num_resources + 1), + GFP_KERNEL); + if (!res) + return NULL; + nsblk->res = (struct resource **) res; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0 + && res->start == start) { + nsblk->res[nsblk->num_resources++] = res; + return res; + } + return NULL; +} + +static struct device *nd_namespace_blk_create(struct nd_region *nd_region) +{ + struct nd_namespace_blk *nsblk; + struct device *dev; + + if (!is_nd_blk(&nd_region->dev)) + return NULL; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + return NULL; + + dev = &nsblk->dev; + dev->type = &namespace_blk_device_type; + nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nsblk->id < 0) { + kfree(nsblk); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id); + dev->parent = &nd_region->dev; + dev->groups = nd_namespace_attribute_groups; + + return &nsblk->dev; +} + +void nd_region_create_blk_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->ns_seed = nd_namespace_blk_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->ns_seed) + dev_err(&nd_region->dev, "failed to create blk namespace\n"); + else + nd_device_register(nd_region->ns_seed); +} + +static struct device **create_namespace_blk(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nd_namespace_label *nd_label; + struct device *dev, **devs = NULL; + struct nd_namespace_blk *nsblk; + struct nvdimm_drvdata *ndd; + int i, l, count = 0; + struct resource *res; + + if (nd_region->ndr_mappings == 0) + return NULL; + + ndd = to_ndd(nd_mapping); + for_each_label(l, nd_label, nd_mapping->labels) { + u32 flags = __le32_to_cpu(nd_label->flags); + char *name[NSLABEL_NAME_LEN]; + struct device **__devs; + + if (flags & NSLABEL_FLAG_LOCAL) + /* pass */; + else + continue; + + for (i = 0; i < count; i++) { + nsblk = to_nd_namespace_blk(devs[i]); + if (memcmp(nsblk->uuid, nd_label->uuid, + NSLABEL_UUID_LEN) == 0) { + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto err; + nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", + dev_name(&nsblk->dev)); + break; + } + } + if (i < count) + continue; + __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); + if (!__devs) + goto err; + memcpy(__devs, devs, sizeof(dev) * count); + kfree(devs); + devs = __devs; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + dev_set_name(dev, "namespace%d.%d", nd_region->id, count); + devs[count++] = dev; + nsblk->id = -1; + nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); + nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, + GFP_KERNEL); + if (!nsblk->uuid) + goto err; + memcpy(name, nd_label->name, NSLABEL_NAME_LEN); + if (name[0]) + nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, + GFP_KERNEL); + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto err; + nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", + dev_name(&nsblk->dev)); + } + + dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n", + __func__, count, count == 1 ? "" : "s"); + + if (count == 0) { + /* Publish a zero-sized namespace for userspace to configure. */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + } + + devs = kcalloc(2, sizeof(dev), GFP_KERNEL); + if (!devs) + goto err; + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + devs[count++] = dev; + } + + return devs; + +err: + for (i = 0; i < count; i++) { + nsblk = to_nd_namespace_blk(devs[i]); + namespace_blk_release(&nsblk->dev); + } + kfree(devs); + return NULL; +} + static int init_active_labels(struct nd_region *nd_region) { int i; @@ -1087,6 +1475,9 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) case ND_DEVICE_NAMESPACE_PMEM: devs = create_namespace_pmem(nd_region); break; + case ND_DEVICE_NAMESPACE_BLK: + devs = create_namespace_blk(nd_region); + break; default: break; } @@ -1095,15 +1486,50 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) if (!devs) return -ENODEV; - nd_region->ns_seed = devs[0]; for (i = 0; devs[i]; i++) { struct device *dev = devs[i]; + int id; - dev_set_name(dev, "namespace%d.%d", nd_region->id, i); + if (type == ND_DEVICE_NAMESPACE_BLK) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(dev); + id = ida_simple_get(&nd_region->ns_ida, 0, 0, + GFP_KERNEL); + nsblk->id = id; + } else + id = i; + + if (id < 0) + break; + dev_set_name(dev, "namespace%d.%d", nd_region->id, id); dev->groups = nd_namespace_attribute_groups; nd_device_register(dev); } + if (i) + nd_region->ns_seed = devs[0]; + + if (devs[i]) { + int j; + + for (j = i; devs[j]; j++) { + struct device *dev = devs[j]; + + device_initialize(dev); + put_device(dev); + } + *err = j - i; + /* + * All of the namespaces we tried to register failed, so + * fail region activation. + */ + if (*err == 0) + rc = -ENODEV; + } kfree(devs); + if (rc == -ENODEV) + return rc; + return i; } diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index c6c889292bab..22489555a6f1 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -17,6 +17,7 @@ #include #include #include +#include extern struct list_head nvdimm_bus_list; extern struct mutex nvdimm_bus_list_mutex; @@ -48,6 +49,8 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); +struct nd_region; +void nd_region_create_blk_seed(struct nd_region *nd_region); void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); @@ -64,8 +67,13 @@ struct nvdimm_drvdata; struct nd_mapping; resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, struct nd_mapping *nd_mapping, resource_size_t *overlap); +resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); +struct nd_mapping; +struct resource *nsblk_add_resource(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, + resource_size_t start); void get_ndd(struct nvdimm_drvdata *ndd); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 03e610cd9f43..9b021b626202 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -73,6 +73,7 @@ static inline struct nd_namespace_index *to_next_namespace_index( struct nd_region { struct device dev; + struct ida ns_ida; struct device *ns_seed; u16 ndr_mappings; u64 ndr_size; @@ -102,6 +103,10 @@ void nd_device_register(struct device *dev); void nd_device_unregister(struct device *dev, enum nd_async_mode mode); int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, size_t len); +ssize_t nd_sector_size_show(unsigned long current_lbasize, + const unsigned long *supported, char *buf); +ssize_t nd_sector_size_store(struct device *dev, const char *buf, + unsigned long *current_lbasize, const unsigned long *supported); int __init nvdimm_init(void); int __init nd_region_init(void); void nvdimm_exit(void); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index b45806f7176d..ac21ce419beb 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -118,7 +118,12 @@ static int is_uuid_busy(struct device *dev, void *data) break; } case ND_DEVICE_NAMESPACE_BLK: { - /* TODO: blk namespace support */ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + if (!nsblk->uuid) + break; + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0) + return -EBUSY; break; } default: @@ -230,7 +235,7 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region) goto retry; } } else if (is_nd_blk(&nd_region->dev)) { - /* TODO: BLK Namespace support */ + available += nd_blk_available_dpa(nd_mapping); } } @@ -360,6 +365,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nd_mapping->ndd = NULL; atomic_dec(&nvdimm->busy); } + } else if (dev->parent && is_nd_blk(dev->parent) && probe) { + struct nd_region *nd_region = to_nd_region(dev->parent); + + nvdimm_bus_lock(dev); + if (nd_region->ns_seed == dev) + nd_region_create_blk_seed(nd_region); + nvdimm_bus_unlock(dev); } } @@ -533,6 +545,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->ndr_mappings = ndr_desc->num_mappings; nd_region->provider_data = ndr_desc->provider_data; nd_region->nd_set = ndr_desc->nd_set; + ida_init(&nd_region->ns_ida); dev = &nd_region->dev; dev_set_name(dev, "region%d", nd_region->id); dev->parent = &nvdimm_bus->dev; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index c130972e08c4..a59dca17b3aa 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -27,6 +27,9 @@ enum { ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, ND_MAX_MAPPINGS = 32, + + /* mark newly adjusted resources as requiring a label update */ + DPA_RESOURCE_ADJUSTED = 1 << 0, }; extern struct attribute_group nvdimm_bus_attribute_group; diff --git a/include/linux/nd.h b/include/linux/nd.h index 255c38a83083..23276ea91690 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -50,6 +50,26 @@ struct nd_namespace_pmem { u8 *uuid; }; +/** + * struct nd_namespace_blk - namespace for dimm-bounded persistent memory + * @dev: namespace device creation by the nd region driver + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace name supplied in the dimm label + * @id: ida allocated id + * @lbasize: blk namespaces have a native sector size when btt not present + * @num_resources: number of dpa extents to claim + * @res: discontiguous dpa extents for given dimm + */ +struct nd_namespace_blk { + struct device dev; + char *alt_name; + u8 *uuid; + int id; + unsigned long lbasize; + int num_resources; + struct resource **res; +}; + static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { return container_of(dev, struct nd_namespace_io, dev); @@ -62,6 +82,11 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) return container_of(nsio, struct nd_namespace_pmem, nsio); } +static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) +{ + return container_of(dev, struct nd_namespace_blk, dev); +} + #define MODULE_ALIAS_ND_DEVICE(type) \ MODULE_ALIAS("nd:t" __stringify(type) "*") #define ND_DEVICE_MODALIAS_FMT "nd:t%d" -- cgit v1.2.3-59-g8ed1b From f524bf271a5cf12a44253194abcf8b6688ff5b9d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 30 May 2015 12:36:02 -0400 Subject: libnvdimm: write pmem label set After 'uuid', 'size', and optionally 'alt_name' have been set to valid values the labels on the dimms can be updated. Write procedure is: 1/ Allocate and write new labels in the "next" index 2/ Free the old labels in the working copy 3/ Write the bitmap and the label space on the dimm 4/ Write the index to make the update valid Label ranges directly mirror the dpa resource values for the given label_id of the namespace. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/nvdimm/dimm_devs.c | 49 ++++++ drivers/nvdimm/label.c | 328 +++++++++++++++++++++++++++++++++++++++- drivers/nvdimm/label.h | 6 + drivers/nvdimm/namespace_devs.c | 83 ++++++++-- drivers/nvdimm/nd.h | 3 + 5 files changed, 455 insertions(+), 14 deletions(-) (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 101d3b76e405..156d518a089c 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -132,6 +132,55 @@ int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) return rc; } +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len) +{ + int rc = validate_dimm(ndd); + size_t max_cmd_size, buf_offset; + struct nd_cmd_set_config_hdr *cmd; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + if (rc) + return rc; + + if (!ndd->data) + return -ENXIO; + + if (offset + len > ndd->nsarea.config_size) + return -ENXIO; + + max_cmd_size = min_t(u32, PAGE_SIZE, len); + max_cmd_size = min_t(u32, max_cmd_size, ndd->nsarea.max_xfer); + cmd = kzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + for (buf_offset = 0; len; len -= cmd->in_length, + buf_offset += cmd->in_length) { + size_t cmd_size; + u32 *status; + + cmd->in_offset = offset + buf_offset; + cmd->in_length = min(max_cmd_size, len); + memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length); + + /* status is output in the last 4-bytes of the command buffer */ + cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32); + status = ((void *) cmd) + cmd_size - sizeof(u32); + + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_SET_CONFIG_DATA, cmd, cmd_size); + if (rc || *status) { + rc = rc ? rc : -ENXIO; + break; + } + } + kfree(cmd); + + return rc; +} + static void nvdimm_release(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 1a3bcd27a57a..ffa85d700459 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include #include "nd-core.h" @@ -55,6 +56,11 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) return ndd->nsindex_size; } +static int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) +{ + return ndd->nsarea.config_size / 129; +} + int nd_label_validate(struct nvdimm_drvdata *ndd) { /* @@ -201,25 +207,32 @@ static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd) return base + 2 * sizeof_namespace_index(ndd); } +static int to_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return nd_label - nd_label_base(ndd); +} + #define for_each_clear_bit_le(bit, addr, size) \ for ((bit) = find_next_zero_bit_le((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1)) /** - * preamble_current - common variable initialization for nd_label_* routines + * preamble_index - common variable initialization for nd_label_* routines * @ndd: dimm container for the relevant label set + * @idx: namespace_index index * @nsindex_out: on return set to the currently active namespace index * @free: on return set to the free label bitmap in the index * @nslot: on return set to the number of slots in the label space */ -static bool preamble_current(struct nvdimm_drvdata *ndd, +static bool preamble_index(struct nvdimm_drvdata *ndd, int idx, struct nd_namespace_index **nsindex_out, unsigned long **free, u32 *nslot) { struct nd_namespace_index *nsindex; - nsindex = to_current_namespace_index(ndd); + nsindex = to_namespace_index(ndd, idx); if (nsindex == NULL) return false; @@ -239,6 +252,22 @@ char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags) return label_id->id; } +static bool preamble_current(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_current, nsindex, + free, nslot); +} + +static bool preamble_next(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_next, nsindex, + free, nslot); +} + static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot) { /* check that we are written where we expect to be written */ @@ -341,3 +370,296 @@ struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) return NULL; } + +static u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return UINT_MAX; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + slot = find_next_bit_le(free, nslot, 0); + if (slot == nslot) + return UINT_MAX; + + clear_bit_le(slot, free); + + return slot; +} + +static bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return false; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (slot < nslot) + return !test_and_set_bit_le(slot, free); + return false; +} + +u32 nd_label_nfree(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return 0; + + return bitmap_weight(free, nslot); +} + +static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq, + unsigned long flags) +{ + struct nd_namespace_index *nsindex; + unsigned long offset; + u64 checksum; + u32 nslot; + int rc; + + nsindex = to_namespace_index(ndd, index); + if (flags & ND_NSINDEX_INIT) + nslot = nvdimm_num_label_slots(ndd); + else + nslot = __le32_to_cpu(nsindex->nslot); + + memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN); + nsindex->flags = __cpu_to_le32(0); + nsindex->seq = __cpu_to_le32(seq); + offset = (unsigned long) nsindex + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->myoff = __cpu_to_le64(offset); + nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd)); + offset = (unsigned long) to_namespace_index(ndd, + nd_label_next_nsindex(index)) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->otheroff = __cpu_to_le64(offset); + offset = (unsigned long) nd_label_base(ndd) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->labeloff = __cpu_to_le64(offset); + nsindex->nslot = __cpu_to_le32(nslot); + nsindex->major = __cpu_to_le16(1); + nsindex->minor = __cpu_to_le16(1); + nsindex->checksum = __cpu_to_le64(0); + if (flags & ND_NSINDEX_INIT) { + unsigned long *free = (unsigned long *) nsindex->free; + u32 nfree = ALIGN(nslot, BITS_PER_LONG); + int last_bits, i; + + memset(nsindex->free, 0xff, nfree / 8); + for (i = 0, last_bits = nfree - nslot; i < last_bits; i++) + clear_bit_le(nslot + i, free); + } + checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1); + nsindex->checksum = __cpu_to_le64(checksum); + rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff), + nsindex, sizeof_namespace_index(ndd)); + if (rc < 0) + return rc; + + if (flags & ND_NSINDEX_INIT) + return 0; + + /* copy the index we just wrote to the new 'next' */ + WARN_ON(index != ndd->ns_next); + nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex); + ndd->ns_current = nd_label_next_nsindex(ndd->ns_current); + ndd->ns_next = nd_label_next_nsindex(ndd->ns_next); + WARN_ON(ndd->ns_current == ndd->ns_next); + + return 0; +} + +static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return (unsigned long) nd_label + - (unsigned long) to_namespace_index(ndd, 0); +} + +static int __pmem_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, + int pos) +{ + u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *victim_label; + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + size_t offset; + int rc; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return -ENXIO; + + /* allocate and write the label to the staging (next) index */ + slot = nd_label_alloc_slot(ndd); + if (slot == UINT_MAX) + return -ENXIO; + dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot); + + nd_label = nd_label_base(ndd) + slot; + memset(nd_label, 0, sizeof(struct nd_namespace_label)); + memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN); + if (nspm->alt_name) + memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN); + nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_UPDATING); + nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings); + nd_label->position = __cpu_to_le16(pos); + nd_label->isetcookie = __cpu_to_le64(cookie); + rawsize = div_u64(resource_size(&nspm->nsio.res), + nd_region->ndr_mappings); + nd_label->rawsize = __cpu_to_le64(rawsize); + nd_label->dpa = __cpu_to_le64(nd_mapping->start); + nd_label->slot = __cpu_to_le32(slot); + + /* update label */ + offset = nd_label_offset(ndd, nd_label); + rc = nvdimm_set_config_data(ndd, offset, nd_label, + sizeof(struct nd_namespace_label)); + if (rc < 0) + return rc; + + /* Garbage collect the previous label */ + victim_label = nd_mapping->labels[0]; + if (victim_label) { + slot = to_slot(ndd, victim_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + } + + /* update index */ + rc = nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); + if (rc < 0) + return rc; + + nd_mapping->labels[0] = nd_label; + + return 0; +} + +static int init_labels(struct nd_mapping *nd_mapping) +{ + int i; + struct nd_namespace_index *nsindex; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + if (!nd_mapping->labels) + nd_mapping->labels = kcalloc(2, sizeof(void *), GFP_KERNEL); + + if (!nd_mapping->labels) + return -ENOMEM; + + if (ndd->ns_current == -1 || ndd->ns_next == -1) + /* pass */; + else + return 0; + + nsindex = to_namespace_index(ndd, 0); + memset(nsindex, 0, ndd->nsarea.config_size); + for (i = 0; i < 2; i++) { + int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); + + if (rc) + return rc; + } + ndd->ns_next = 1; + ndd->ns_current = 0; + + return 0; +} + +static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + u8 label_uuid[NSLABEL_UUID_LEN]; + int l, num_freed = 0; + unsigned long *free; + u32 nslot, slot; + + if (!uuid) + return 0; + + /* no index || no labels == nothing to delete */ + if (!preamble_next(ndd, &nsindex, &free, &nslot) + || !nd_mapping->labels) + return 0; + + for_each_label(l, nd_label, nd_mapping->labels) { + int j; + + memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0) + continue; + slot = to_slot(ndd, nd_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + for (j = l; nd_mapping->labels[j + 1]; j++) { + struct nd_namespace_label *next_label; + + next_label = nd_mapping->labels[j + 1]; + nd_mapping->labels[j] = next_label; + } + nd_mapping->labels[j] = NULL; + num_freed++; + } + + if (num_freed > l) { + /* + * num_freed will only ever be > l when we delete the last + * label + */ + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + dev_dbg(ndd->dev, "%s: no more labels\n", __func__); + } + + return nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); +} + +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + if (size == 0) { + rc = del_labels(nd_mapping, nspm->uuid); + if (rc) + return rc; + continue; + } + + rc = init_labels(nd_mapping); + if (rc) + return rc; + + rc = __pmem_label_update(nd_region, nd_mapping, nspm, i); + if (rc) + return rc; + } + + return 0; +} diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index 8ee1376526c7..6d376be31937 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -34,6 +34,7 @@ enum { BTTINFO_MAJOR_VERSION = 1, ND_LABEL_MIN_SIZE = 512 * 129, /* see sizeof_namespace_index() */ ND_LABEL_ID_SIZE = 50, + ND_NSINDEX_INIT = 0x1, }; static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0"; @@ -127,4 +128,9 @@ void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd); int nd_label_active_count(struct nvdimm_drvdata *ndd); struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); +u32 nd_label_nfree(struct nvdimm_drvdata *ndd); +struct nd_region; +struct nd_namespace_pmem; +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size); #endif /* __LABEL_H__ */ diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index ad0ec09ca40f..546e77e122fd 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -149,20 +149,53 @@ static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk) return size; } +static int nd_namespace_label_update(struct nd_region *nd_region, + struct device *dev) +{ + dev_WARN_ONCE(dev, dev->driver, + "namespace must be idle during label update\n"); + if (dev->driver) + return 0; + + /* + * Only allow label writes that will result in a valid namespace + * or deletion of an existing namespace. + */ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + struct resource *res = &nspm->nsio.res; + resource_size_t size = resource_size(res); + + if (size == 0 && nspm->uuid) + /* delete allocation */; + else if (!nspm->uuid) + return 0; + + return nd_pmem_namespace_label_update(nd_region, nspm, size); + } else if (is_namespace_blk(dev)) { + /* TODO: implement blk labels */ + return 0; + } else + return -ENXIO; +} + static ssize_t alt_name_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct nd_region *nd_region = to_nd_region(dev->parent); ssize_t rc; device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); rc = __alt_name_store(dev, buf, len); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc); nvdimm_bus_unlock(dev); device_unlock(dev); - return rc; + return rc < 0 ? rc : len; } static ssize_t alt_name_show(struct device *dev, @@ -709,6 +742,7 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) static ssize_t size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct nd_region *nd_region = to_nd_region(dev->parent); unsigned long long val; u8 **uuid = NULL; int rc; @@ -721,6 +755,8 @@ static ssize_t size_store(struct device *dev, nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); rc = __size_store(dev, val); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); @@ -744,7 +780,7 @@ static ssize_t size_store(struct device *dev, nvdimm_bus_unlock(dev); device_unlock(dev); - return rc ? rc : len; + return rc < 0 ? rc : len; } static ssize_t size_show(struct device *dev, @@ -804,17 +840,34 @@ static int namespace_update_uuid(struct nd_region *nd_region, u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0; struct nd_label_id old_label_id; struct nd_label_id new_label_id; - int i, rc; + int i; - rc = nd_is_uuid_unique(dev, new_uuid) ? 0 : -EINVAL; - if (rc) { - kfree(new_uuid); - return rc; - } + if (!nd_is_uuid_unique(dev, new_uuid)) + return -EINVAL; if (*old_uuid == NULL) goto out; + /* + * If we've already written a label with this uuid, then it's + * too late to rename because we can't reliably update the uuid + * without losing the old namespace. Userspace must delete this + * namespace to abandon the old uuid. + */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + /* + * This check by itself is sufficient because old_uuid + * would be NULL above if this uuid did not exist in the + * currently written set. + * + * FIXME: can we delete uuid with zero dpa allocated? + */ + if (nd_mapping->labels) + return -EBUSY; + } + nd_label_gen_id(&old_label_id, *old_uuid, flags); nd_label_gen_id(&new_label_id, new_uuid, flags); for (i = 0; i < nd_region->ndr_mappings; i++) { @@ -858,12 +911,16 @@ static ssize_t uuid_store(struct device *dev, rc = nd_uuid_store(dev, &uuid, buf, len); if (rc >= 0) rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + else + kfree(uuid); dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); device_unlock(dev); - return rc ? rc : len; + return rc < 0 ? rc : len; } static DEVICE_ATTR_RW(uuid); @@ -907,6 +964,7 @@ static ssize_t sector_size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); ssize_t rc; if (!is_namespace_blk(dev)) @@ -916,8 +974,11 @@ static ssize_t sector_size_store(struct device *dev, nvdimm_bus_lock(dev); rc = nd_sector_size_store(dev, buf, &nsblk->lbasize, ns_lbasize_supported); - dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, - rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__, + rc, rc < 0 ? "tried" : "wrote", buf, + buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); device_unlock(dev); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 9b021b626202..bfa849617358 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -93,6 +93,7 @@ static inline unsigned nd_inc_seq(unsigned seq) return next[seq & 3]; } + enum nd_async_mode { ND_SYNC, ND_ASYNC, @@ -115,6 +116,8 @@ struct nvdimm; struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len); struct nd_region *to_nd_region(struct device *dev); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); -- cgit v1.2.3-59-g8ed1b From 0ba1c634892b3590779803a701bcb82e8c32cc7a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 30 May 2015 12:35:36 -0400 Subject: libnvdimm: write blk label set After 'uuid', 'size', 'sector_size', and optionally 'alt_name' have been set to valid values the labels on the dimm can be updated. The difference with the pmem case is that blk namespaces are limited to one dimm and can cover discontiguous ranges in dpa space. Also, after allocating label slots, it is useful for userspace to know how many slots are left. Export this information in sysfs. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 4 + drivers/nvdimm/dimm_devs.c | 25 ++++ drivers/nvdimm/label.c | 301 +++++++++++++++++++++++++++++++++++++--- drivers/nvdimm/label.h | 5 + drivers/nvdimm/namespace_devs.c | 57 +++++++- drivers/nvdimm/nd-core.h | 1 + 6 files changed, 369 insertions(+), 24 deletions(-) (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index fddc3f2a8f80..ca802702440e 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -155,6 +155,10 @@ static void nd_async_device_unregister(void *d, async_cookie_t cookie) { struct device *dev = d; + /* flush bus operations before delete */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + device_unregister(dev); put_device(dev); } diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 156d518a089c..83b179ed6d61 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -19,6 +19,7 @@ #include #include #include "nd-core.h" +#include "label.h" #include "nd.h" static DEFINE_IDA(dimm_ida); @@ -296,9 +297,33 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(state); +static ssize_t available_slots_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + ssize_t rc; + u32 nfree; + + if (!ndd) + return -ENXIO; + + nvdimm_bus_lock(dev); + nfree = nd_label_nfree(ndd); + if (nfree - 1 > nfree) { + dev_WARN_ONCE(dev, 1, "we ate our last label?\n"); + nfree = 0; + } else + nfree--; + rc = sprintf(buf, "%d\n", nfree); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(available_slots); + static struct attribute *nvdimm_attributes[] = { &dev_attr_state.attr, &dev_attr_commands.attr, + &dev_attr_available_slots.attr, NULL, }; diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index ffa85d700459..34148003fc73 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -56,7 +56,7 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) return ndd->nsindex_size; } -static int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) { return ndd->nsarea.config_size / 129; } @@ -371,7 +371,7 @@ struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) return NULL; } -static u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) { struct nd_namespace_index *nsindex; unsigned long *free; @@ -391,7 +391,7 @@ static u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) return slot; } -static bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot) +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot) { struct nd_namespace_index *nsindex; unsigned long *free; @@ -416,7 +416,7 @@ u32 nd_label_nfree(struct nvdimm_drvdata *ndd) WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); if (!preamble_next(ndd, &nsindex, &free, &nslot)) - return 0; + return nvdimm_num_label_slots(ndd); return bitmap_weight(free, nslot); } @@ -554,22 +554,270 @@ static int __pmem_label_update(struct nd_region *nd_region, return 0; } -static int init_labels(struct nd_mapping *nd_mapping) +static void del_label(struct nd_mapping *nd_mapping, int l) +{ + struct nd_namespace_label *next_label, *nd_label; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + unsigned int slot; + int j; + + nd_label = nd_mapping->labels[l]; + slot = to_slot(ndd, nd_label); + dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot); + + for (j = l; (next_label = nd_mapping->labels[j + 1]); j++) + nd_mapping->labels[j] = next_label; + nd_mapping->labels[j] = NULL; +} + +static bool is_old_resource(struct resource *res, struct resource **list, int n) { int i; + + if (res->flags & DPA_RESOURCE_ADJUSTED) + return false; + for (i = 0; i < n; i++) + if (res == list[i]) + return true; + return false; +} + +static struct resource *to_resource(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + struct resource *res; + + for_each_dpa_resource(ndd, res) { + if (res->start != __le64_to_cpu(nd_label->dpa)) + continue; + if (resource_size(res) != __le64_to_cpu(nd_label->rawsize)) + continue; + return res; + } + + return NULL; +} + +/* + * 1/ Account all the labels that can be freed after this update + * 2/ Allocate and write the label to the staging (next) index + * 3/ Record the resources in the namespace device + */ +static int __blk_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk, + int num_labels) +{ + int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + unsigned long *free, *victim_map = NULL; + struct resource *res, **old_res_list; + struct nd_label_id label_id; + u8 uuid[NSLABEL_UUID_LEN]; + u32 nslot, slot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return -ENXIO; + + old_res_list = nsblk->res; + nfree = nd_label_nfree(ndd); + old_num_resources = nsblk->num_resources; + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + + /* + * We need to loop over the old resources a few times, which seems a + * bit inefficient, but we need to know that we have the label + * space before we start mutating the tracking structures. + * Otherwise the recovery method of last resort for userspace is + * disable and re-enable the parent region. + */ + alloc = 0; + for_each_dpa_resource(ndd, res) { + if (strcmp(res->name, label_id.id) != 0) + continue; + if (!is_old_resource(res, old_res_list, old_num_resources)) + alloc++; + } + + victims = 0; + if (old_num_resources) { + /* convert old local-label-map to dimm-slot victim-map */ + victim_map = kcalloc(BITS_TO_LONGS(nslot), sizeof(long), + GFP_KERNEL); + if (!victim_map) + return -ENOMEM; + + /* mark unused labels for garbage collection */ + for_each_clear_bit_le(slot, free, nslot) { + nd_label = nd_label_base(ndd) + slot; + memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) + continue; + res = to_resource(ndd, nd_label); + if (res && is_old_resource(res, old_res_list, + old_num_resources)) + continue; + slot = to_slot(ndd, nd_label); + set_bit(slot, victim_map); + victims++; + } + } + + /* don't allow updates that consume the last label */ + if (nfree - alloc < 0 || nfree - alloc + victims < 1) { + dev_info(&nsblk->dev, "insufficient label space\n"); + kfree(victim_map); + return -ENOSPC; + } + /* from here on we need to abort on error */ + + + /* assign all resources to the namespace before writing the labels */ + nsblk->res = NULL; + nsblk->num_resources = 0; + for_each_dpa_resource(ndd, res) { + if (strcmp(res->name, label_id.id) != 0) + continue; + if (!nsblk_add_resource(nd_region, ndd, nsblk, res->start)) { + rc = -ENOMEM; + goto abort; + } + } + + for (i = 0; i < nsblk->num_resources; i++) { + size_t offset; + + res = nsblk->res[i]; + if (is_old_resource(res, old_res_list, old_num_resources)) + continue; /* carry-over */ + slot = nd_label_alloc_slot(ndd); + if (slot == UINT_MAX) + goto abort; + dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot); + + nd_label = nd_label_base(ndd) + slot; + memset(nd_label, 0, sizeof(struct nd_namespace_label)); + memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN); + if (nsblk->alt_name) + memcpy(nd_label->name, nsblk->alt_name, + NSLABEL_NAME_LEN); + nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL); + nd_label->nlabel = __cpu_to_le16(0); /* N/A */ + nd_label->position = __cpu_to_le16(0); /* N/A */ + nd_label->isetcookie = __cpu_to_le64(0); /* N/A */ + nd_label->dpa = __cpu_to_le64(res->start); + nd_label->rawsize = __cpu_to_le64(resource_size(res)); + nd_label->lbasize = __cpu_to_le64(nsblk->lbasize); + nd_label->slot = __cpu_to_le32(slot); + + /* update label */ + offset = nd_label_offset(ndd, nd_label); + rc = nvdimm_set_config_data(ndd, offset, nd_label, + sizeof(struct nd_namespace_label)); + if (rc < 0) + goto abort; + } + + /* free up now unused slots in the new index */ + for_each_set_bit(slot, victim_map, victim_map ? nslot : 0) { + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + nd_label_free_slot(ndd, slot); + } + + /* update index */ + rc = nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); + if (rc) + goto abort; + + /* + * Now that the on-dimm labels are up to date, fix up the tracking + * entries in nd_mapping->labels + */ + nlabel = 0; + for_each_label(l, nd_label, nd_mapping->labels) { + nlabel++; + memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) + continue; + nlabel--; + del_label(nd_mapping, l); + l--; /* retry with the new label at this index */ + } + if (nlabel + nsblk->num_resources > num_labels) { + /* + * Bug, we can't end up with more resources than + * available labels + */ + WARN_ON_ONCE(1); + rc = -ENXIO; + goto out; + } + + for_each_clear_bit_le(slot, free, nslot) { + nd_label = nd_label_base(ndd) + slot; + memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) + continue; + res = to_resource(ndd, nd_label); + res->flags &= ~DPA_RESOURCE_ADJUSTED; + dev_vdbg(&nsblk->dev, "assign label[%d] slot: %d\n", l, slot); + nd_mapping->labels[l++] = nd_label; + } + nd_mapping->labels[l] = NULL; + + out: + kfree(old_res_list); + kfree(victim_map); + return rc; + + abort: + /* + * 1/ repair the allocated label bitmap in the index + * 2/ restore the resource list + */ + nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd)); + kfree(nsblk->res); + nsblk->res = old_res_list; + nsblk->num_resources = old_num_resources; + old_res_list = NULL; + goto out; +} + +static int init_labels(struct nd_mapping *nd_mapping, int num_labels) +{ + int i, l, old_num_labels = 0; struct nd_namespace_index *nsindex; + struct nd_namespace_label *nd_label; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *); - if (!nd_mapping->labels) - nd_mapping->labels = kcalloc(2, sizeof(void *), GFP_KERNEL); + for_each_label(l, nd_label, nd_mapping->labels) + old_num_labels++; + /* + * We need to preserve all the old labels for the mapping so + * they can be garbage collected after writing the new labels. + */ + if (num_labels > old_num_labels) { + struct nd_namespace_label **labels; + + labels = krealloc(nd_mapping->labels, size, GFP_KERNEL); + if (!labels) + return -ENOMEM; + nd_mapping->labels = labels; + } if (!nd_mapping->labels) return -ENOMEM; + for (i = old_num_labels; i <= num_labels; i++) + nd_mapping->labels[i] = NULL; + if (ndd->ns_current == -1 || ndd->ns_next == -1) /* pass */; else - return 0; + return max(num_labels, old_num_labels); nsindex = to_namespace_index(ndd, 0); memset(nsindex, 0, ndd->nsarea.config_size); @@ -582,7 +830,7 @@ static int init_labels(struct nd_mapping *nd_mapping) ndd->ns_next = 1; ndd->ns_current = 0; - return 0; + return max(num_labels, old_num_labels); } static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) @@ -604,22 +852,15 @@ static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) return 0; for_each_label(l, nd_label, nd_mapping->labels) { - int j; - memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0) continue; slot = to_slot(ndd, nd_label); nd_label_free_slot(ndd, slot); dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); - for (j = l; nd_mapping->labels[j + 1]; j++) { - struct nd_namespace_label *next_label; - - next_label = nd_mapping->labels[j + 1]; - nd_mapping->labels[j] = next_label; - } - nd_mapping->labels[j] = NULL; + del_label(nd_mapping, l); num_freed++; + l--; /* retry with new label at this index */ } if (num_freed > l) { @@ -652,8 +893,8 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, continue; } - rc = init_labels(nd_mapping); - if (rc) + rc = init_labels(nd_mapping, 1); + if (rc < 0) return rc; rc = __pmem_label_update(nd_region, nd_mapping, nspm, i); @@ -663,3 +904,23 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, return 0; } + +int nd_blk_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_blk *nsblk, resource_size_t size) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct resource *res; + int count = 0; + + if (size == 0) + return del_labels(nd_mapping, nsblk->uuid); + + for_each_dpa_resource(to_ndd(nd_mapping), res) + count++; + + count = init_labels(nd_mapping, count); + if (count < 0) + return count; + + return __blk_label_update(nd_region, nd_mapping, nsblk, count); +} diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index 6d376be31937..a59ef6eef2a3 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -128,9 +128,14 @@ void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd); int nd_label_active_count(struct nvdimm_drvdata *ndd); struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd); +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot); u32 nd_label_nfree(struct nvdimm_drvdata *ndd); struct nd_region; struct nd_namespace_pmem; +struct nd_namespace_blk; int nd_pmem_namespace_label_update(struct nd_region *nd_region, struct nd_namespace_pmem *nspm, resource_size_t size); +int nd_blk_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_blk *nsblk, resource_size_t size); #endif /* __LABEL_H__ */ diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 546e77e122fd..50b502b1908e 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -163,8 +163,7 @@ static int nd_namespace_label_update(struct nd_region *nd_region, */ if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - struct resource *res = &nspm->nsio.res; - resource_size_t size = resource_size(res); + resource_size_t size = resource_size(&nspm->nsio.res); if (size == 0 && nspm->uuid) /* delete allocation */; @@ -173,8 +172,15 @@ static int nd_namespace_label_update(struct nd_region *nd_region, return nd_pmem_namespace_label_update(nd_region, nspm, size); } else if (is_namespace_blk(dev)) { - /* TODO: implement blk labels */ - return 0; + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + resource_size_t size = nd_namespace_blk_size(nsblk); + + if (size == 0 && nsblk->uuid) + /* delete allocation */; + else if (!nsblk->uuid || !nsblk->lbasize) + return 0; + + return nd_blk_namespace_label_update(nd_region, nsblk, size); } else return -ENXIO; } @@ -986,6 +992,48 @@ static ssize_t sector_size_store(struct device *dev, } static DEVICE_ATTR_RW(sector_size); +static ssize_t dpa_extents_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_label_id label_id; + int count = 0, i; + u8 *uuid = NULL; + u32 flags = 0; + + nvdimm_bus_lock(dev); + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + flags = 0; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + flags = NSLABEL_FLAG_LOCAL; + } + + if (!uuid) + goto out; + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + count++; + } + out: + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(dpa_extents); + static struct attribute *nd_namespace_attributes[] = { &dev_attr_nstype.attr, &dev_attr_size.attr, @@ -993,6 +1041,7 @@ static struct attribute *nd_namespace_attributes[] = { &dev_attr_resource.attr, &dev_attr_alt_name.attr, &dev_attr_sector_size.attr, + &dev_attr_dpa_extents.attr, NULL, }; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 22489555a6f1..78d6c51f4bac 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -75,5 +75,6 @@ struct nd_mapping; struct resource *nsblk_add_resource(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, resource_size_t start); +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); void get_ndd(struct nvdimm_drvdata *ndd); #endif /* __ND_CORE_H__ */ -- cgit v1.2.3-59-g8ed1b From 8c2f7e8658df1d3b7cbfa62706941d14c715823a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 25 Jun 2015 04:20:04 -0400 Subject: libnvdimm: infrastructure for btt devices NVDIMM namespaces, in addition to accepting "struct bio" based requests, also have the capability to perform byte-aligned accesses. By default only the bio/block interface is used. However, if another driver can make effective use of the byte-aligned capability it can claim namespace interface and use the byte-aligned ->rw_bytes() interface. The BTT driver is the initial first consumer of this mechanism to allow adding atomic sector update semantics to a pmem or blk namespace. This patch is the sysfs infrastructure to allow configuring a BTT instance for a namespace. Enabling that BTT and performing i/o is in a subsequent patch. Cc: Greg KH Cc: Neil Brown Signed-off-by: Dan Williams --- drivers/nvdimm/Kconfig | 3 + drivers/nvdimm/Makefile | 1 + drivers/nvdimm/btt.h | 45 +++++ drivers/nvdimm/btt_devs.c | 422 ++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/bus.c | 12 +- drivers/nvdimm/label.c | 5 +- drivers/nvdimm/namespace_devs.c | 204 +++++++++++++++---- drivers/nvdimm/nd-core.h | 4 + drivers/nvdimm/nd.h | 40 ++++ drivers/nvdimm/pmem.c | 132 ++++++++----- drivers/nvdimm/region.c | 8 +- drivers/nvdimm/region_devs.c | 39 +++- include/linux/nd.h | 63 +++++- 13 files changed, 879 insertions(+), 99 deletions(-) create mode 100644 drivers/nvdimm/btt.h create mode 100644 drivers/nvdimm/btt_devs.c (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 07a29113b870..5680e8e7a7aa 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -33,4 +33,7 @@ config BLK_DEV_PMEM Say Y if you want to use an NVDIMM +config BTT + def_bool y + endif diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index abce98f87f16..6085b4bd7312 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -11,3 +11,4 @@ libnvdimm-y += region_devs.o libnvdimm-y += region.o libnvdimm-y += namespace_devs.o libnvdimm-y += label.o +libnvdimm-$(CONFIG_BTT) += btt_devs.o diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h new file mode 100644 index 000000000000..e8f6d8e0ddd3 --- /dev/null +++ b/drivers/nvdimm/btt.h @@ -0,0 +1,45 @@ +/* + * Block Translation Table library + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_BTT_H +#define _LINUX_BTT_H + +#include + +#define BTT_SIG_LEN 16 +#define BTT_SIG "BTT_ARENA_INFO\0" + +struct btt_sb { + u8 signature[BTT_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le32 external_lbasize; + __le32 external_nlba; + __le32 internal_lbasize; + __le32 internal_nlba; + __le32 nfree; + __le32 infosize; + __le64 nextoff; + __le64 dataoff; + __le64 mapoff; + __le64 logoff; + __le64 info2off; + u8 padding[3968]; + __le64 checksum; +}; + +#endif diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c new file mode 100644 index 000000000000..effb70a88347 --- /dev/null +++ b/drivers/nvdimm/btt_devs.c @@ -0,0 +1,422 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include "nd-core.h" +#include "btt.h" +#include "nd.h" + +static void __nd_btt_detach_ndns(struct nd_btt *nd_btt) +{ + struct nd_namespace_common *ndns = nd_btt->ndns; + + dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex) + || ndns->claim != &nd_btt->dev, + "%s: invalid claim\n", __func__); + ndns->claim = NULL; + nd_btt->ndns = NULL; + put_device(&ndns->dev); +} + +static void nd_btt_detach_ndns(struct nd_btt *nd_btt) +{ + struct nd_namespace_common *ndns = nd_btt->ndns; + + if (!ndns) + return; + get_device(&ndns->dev); + device_lock(&ndns->dev); + __nd_btt_detach_ndns(nd_btt); + device_unlock(&ndns->dev); + put_device(&ndns->dev); +} + +static bool __nd_btt_attach_ndns(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns) +{ + if (ndns->claim) + return false; + dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex) + || nd_btt->ndns, + "%s: invalid claim\n", __func__); + ndns->claim = &nd_btt->dev; + nd_btt->ndns = ndns; + get_device(&ndns->dev); + return true; +} + +static bool nd_btt_attach_ndns(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns) +{ + bool claimed; + + device_lock(&ndns->dev); + claimed = __nd_btt_attach_ndns(nd_btt, ndns); + device_unlock(&ndns->dev); + return claimed; +} + +static void nd_btt_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_btt *nd_btt = to_nd_btt(dev); + + dev_dbg(dev, "%s\n", __func__); + nd_btt_detach_ndns(nd_btt); + ida_simple_remove(&nd_region->btt_ida, nd_btt->id); + kfree(nd_btt->uuid); + kfree(nd_btt); +} + +static struct device_type nd_btt_device_type = { + .name = "nd_btt", + .release = nd_btt_release, +}; + +bool is_nd_btt(struct device *dev) +{ + return dev->type == &nd_btt_device_type; +} +EXPORT_SYMBOL(is_nd_btt); + +struct nd_btt *to_nd_btt(struct device *dev) +{ + struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev); + + WARN_ON(!is_nd_btt(dev)); + return nd_btt; +} +EXPORT_SYMBOL(to_nd_btt); + +static const unsigned long btt_lbasize_supported[] = { 512, 4096, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf); +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize, + btt_lbasize_supported); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_btt->uuid) + return sprintf(buf, "%pUb\n", nd_btt->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_btt->ndns + ? dev_name(&nd_btt->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static int namespace_match(struct device *dev, void *data) +{ + char *name = data; + + return strcmp(name, dev_name(dev)) == 0; +} + +static bool is_nd_btt_idle(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_region->btt_seed == dev || nd_btt->ndns || dev->driver) + return false; + return true; +} + +static ssize_t __namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + struct nd_namespace_common *ndns; + struct device *found; + char *name; + + if (dev->driver) { + dev_dbg(dev, "%s: -EBUSY\n", __func__); + return -EBUSY; + } + + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + strim(name); + + if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0) + /* pass */; + else { + len = -EINVAL; + goto out; + } + + ndns = nd_btt->ndns; + if (strcmp(name, "") == 0) { + /* detach the namespace and destroy / reset the btt device */ + nd_btt_detach_ndns(nd_btt); + if (is_nd_btt_idle(dev)) + nd_device_unregister(dev, ND_ASYNC); + else { + nd_btt->lbasize = 0; + kfree(nd_btt->uuid); + nd_btt->uuid = NULL; + } + goto out; + } else if (ndns) { + dev_dbg(dev, "namespace already set to: %s\n", + dev_name(&ndns->dev)); + len = -EBUSY; + goto out; + } + + found = device_find_child(dev->parent, name, namespace_match); + if (!found) { + dev_dbg(dev, "'%s' not found under %s\n", name, + dev_name(dev->parent)); + len = -ENODEV; + goto out; + } + + ndns = to_ndns(found); + if (__nvdimm_namespace_capacity(ndns) < SZ_16M) { + dev_dbg(dev, "%s too small to host btt\n", name); + len = -ENXIO; + goto out_attach; + } + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nd_btt->dev)); + if (!nd_btt_attach_ndns(nd_btt, ndns)) { + dev_dbg(dev, "%s already claimed\n", + dev_name(&ndns->dev)); + len = -EBUSY; + } + + out_attach: + put_device(&ndns->dev); /* from device_find_child */ + out: + kfree(name); + return len; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + ssize_t rc; + + nvdimm_bus_lock(dev); + device_lock(dev); + rc = __namespace_store(dev, attr, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static struct attribute *nd_btt_attributes[] = { + &dev_attr_sector_size.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + NULL, +}; + +static struct attribute_group nd_btt_attribute_group = { + .attrs = nd_btt_attributes, +}; + +static const struct attribute_group *nd_btt_attribute_groups[] = { + &nd_btt_attribute_group, + &nd_device_attribute_group, + NULL, +}; + +static struct device *__nd_btt_create(struct nd_region *nd_region, + unsigned long lbasize, u8 *uuid, + struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt; + struct device *dev; + + nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL); + if (!nd_btt) + return NULL; + + nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL); + if (nd_btt->id < 0) { + kfree(nd_btt); + return NULL; + } + + nd_btt->lbasize = lbasize; + if (uuid) + uuid = kmemdup(uuid, 16, GFP_KERNEL); + nd_btt->uuid = uuid; + dev = &nd_btt->dev; + dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id); + dev->parent = &nd_region->dev; + dev->type = &nd_btt_device_type; + dev->groups = nd_btt_attribute_groups; + device_initialize(&nd_btt->dev); + if (ndns && !__nd_btt_attach_ndns(nd_btt, ndns)) { + dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n", + __func__, dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; +} + +struct device *nd_btt_create(struct nd_region *nd_region) +{ + struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL); + + if (dev) + __nd_device_register(dev); + return dev; +} + +/* + * nd_btt_sb_checksum: compute checksum for btt info block + * + * Returns a fletcher64 checksum of everything in the given info block + * except the last field (since that's where the checksum lives). + */ +u64 nd_btt_sb_checksum(struct btt_sb *btt_sb) +{ + u64 sum, sum_save; + + sum_save = btt_sb->checksum; + btt_sb->checksum = 0; + sum = nd_fletcher64(btt_sb, sizeof(*btt_sb), 1); + btt_sb->checksum = sum_save; + return sum; +} +EXPORT_SYMBOL(nd_btt_sb_checksum); + +static int __nd_btt_probe(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns, struct btt_sb *btt_sb) +{ + u64 checksum; + + if (!btt_sb || !ndns || !nd_btt) + return -ENODEV; + + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb))) + return -ENXIO; + + if (nvdimm_namespace_capacity(ndns) < SZ_16M) + return -ENXIO; + + if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0) + return -ENODEV; + + checksum = le64_to_cpu(btt_sb->checksum); + btt_sb->checksum = 0; + if (checksum != nd_btt_sb_checksum(btt_sb)) + return -ENODEV; + btt_sb->checksum = cpu_to_le64(checksum); + + nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize); + nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL); + if (!nd_btt->uuid) + return -ENOMEM; + + __nd_device_register(&nd_btt->dev); + + return 0; +} + +int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata) +{ + int rc; + struct device *dev; + struct btt_sb *btt_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + nvdimm_bus_lock(&ndns->dev); + dev = __nd_btt_create(nd_region, 0, NULL, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!dev) + return -ENOMEM; + dev_set_drvdata(dev, drvdata); + btt_sb = kzalloc(sizeof(*btt_sb), GFP_KERNEL); + rc = __nd_btt_probe(to_nd_btt(dev), ndns, btt_sb); + kfree(btt_sb); + dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__, + rc == 0 ? dev_name(dev) : ""); + if (rc < 0) { + __nd_btt_detach_ndns(to_nd_btt(dev)); + put_device(dev); + } + + return rc; +} +EXPORT_SYMBOL(nd_btt_probe); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index ca802702440e..dd12f38397db 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -14,8 +14,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -103,6 +105,7 @@ static int nvdimm_bus_probe(struct device *dev) dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, dev_name(dev), rc); + if (rc != 0) module_put(provider); return rc; @@ -163,14 +166,19 @@ static void nd_async_device_unregister(void *d, async_cookie_t cookie) put_device(dev); } -void nd_device_register(struct device *dev) +void __nd_device_register(struct device *dev) { dev->bus = &nvdimm_bus_type; - device_initialize(dev); get_device(dev); async_schedule_domain(nd_async_device_register, dev, &nd_async_domain); } + +void nd_device_register(struct device *dev) +{ + device_initialize(dev); + __nd_device_register(dev); +} EXPORT_SYMBOL(nd_device_register); void nd_device_unregister(struct device *dev, enum nd_async_mode mode) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 34148003fc73..96526dcfdd37 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -666,7 +666,7 @@ static int __blk_label_update(struct nd_region *nd_region, /* don't allow updates that consume the last label */ if (nfree - alloc < 0 || nfree - alloc + victims < 1) { - dev_info(&nsblk->dev, "insufficient label space\n"); + dev_info(&nsblk->common.dev, "insufficient label space\n"); kfree(victim_map); return -ENOSPC; } @@ -762,7 +762,8 @@ static int __blk_label_update(struct nd_region *nd_region, continue; res = to_resource(ndd, nd_label); res->flags &= ~DPA_RESOURCE_ADJUSTED; - dev_vdbg(&nsblk->dev, "assign label[%d] slot: %d\n", l, slot); + dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n", + l, slot); nd_mapping->labels[l++] = nd_label; } nd_mapping->labels[l] = NULL; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 50b502b1908e..2c50a0719f8d 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -102,7 +102,7 @@ static ssize_t __alt_name_store(struct device *dev, const char *buf, } else return -ENXIO; - if (dev->driver) + if (dev->driver || to_ndns(dev)->claim) return -EBUSY; input = kmemdup(buf, len + 1, GFP_KERNEL); @@ -133,7 +133,7 @@ out: static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk) { - struct nd_region *nd_region = to_nd_region(nsblk->dev.parent); + struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent); struct nd_mapping *nd_mapping = &nd_region->mapping[0]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_label_id label_id; @@ -152,9 +152,9 @@ static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk) static int nd_namespace_label_update(struct nd_region *nd_region, struct device *dev) { - dev_WARN_ONCE(dev, dev->driver, + dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim, "namespace must be idle during label update\n"); - if (dev->driver) + if (dev->driver || to_ndns(dev)->claim) return 0; /* @@ -666,7 +666,7 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) u8 *uuid = NULL; int rc, i; - if (dev->driver) + if (dev->driver || to_ndns(dev)->claim) return -EBUSY; if (is_namespace_pmem(dev)) { @@ -733,12 +733,16 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) nd_namespace_pmem_set_size(nd_region, nspm, val * nd_region->ndr_mappings); } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + /* * Try to delete the namespace if we deleted all of its - * allocation and this is not the seed device for the - * region. + * allocation, this is not the seed device for the + * region, and it is not actively claimed by a btt + * instance. */ - if (val == 0 && nd_region->ns_seed != dev) + if (val == 0 && nd_region->ns_seed != dev + && !nsblk->common.claim) nd_device_unregister(dev, ND_ASYNC); } @@ -789,26 +793,42 @@ static ssize_t size_store(struct device *dev, return rc < 0 ? rc : len; } -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns) { - unsigned long long size = 0; + struct device *dev = &ndns->dev; - nvdimm_bus_lock(dev); if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - size = resource_size(&nspm->nsio.res); + return resource_size(&nspm->nsio.res); } else if (is_namespace_blk(dev)) { - size = nd_namespace_blk_size(to_nd_namespace_blk(dev)); + return nd_namespace_blk_size(to_nd_namespace_blk(dev)); } else if (is_namespace_io(dev)) { struct nd_namespace_io *nsio = to_nd_namespace_io(dev); - size = resource_size(&nsio->res); - } - nvdimm_bus_unlock(dev); + return resource_size(&nsio->res); + } else + WARN_ONCE(1, "unknown namespace type\n"); + return 0; +} + +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + resource_size_t size; - return sprintf(buf, "%llu\n", size); + nvdimm_bus_lock(&ndns->dev); + size = __nvdimm_namespace_capacity(ndns); + nvdimm_bus_unlock(&ndns->dev); + + return size; +} +EXPORT_SYMBOL(nvdimm_namespace_capacity); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long) + nvdimm_namespace_capacity(to_ndns(dev))); } static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); @@ -897,8 +917,8 @@ static ssize_t uuid_store(struct device *dev, { struct nd_region *nd_region = to_nd_region(dev->parent); u8 *uuid = NULL; + ssize_t rc = 0; u8 **ns_uuid; - ssize_t rc; if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); @@ -914,7 +934,10 @@ static ssize_t uuid_store(struct device *dev, device_lock(dev); nvdimm_bus_lock(dev); wait_nvdimm_bus_probe_idle(dev); - rc = nd_uuid_store(dev, &uuid, buf, len); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_uuid_store(dev, &uuid, buf, len); if (rc >= 0) rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid); if (rc >= 0) @@ -971,15 +994,18 @@ static ssize_t sector_size_store(struct device *dev, { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); struct nd_region *nd_region = to_nd_region(dev->parent); - ssize_t rc; + ssize_t rc = 0; if (!is_namespace_blk(dev)) return -ENXIO; device_lock(dev); nvdimm_bus_lock(dev); - rc = nd_sector_size_store(dev, buf, &nsblk->lbasize, - ns_lbasize_supported); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_sector_size_store(dev, buf, &nsblk->lbasize, + ns_lbasize_supported); if (rc >= 0) rc = nd_namespace_label_update(nd_region, dev); dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__, @@ -1034,12 +1060,48 @@ static ssize_t dpa_extents_show(struct device *dev, } static DEVICE_ATTR_RO(dpa_extents); +static ssize_t holder_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + ssize_t rc; + + device_lock(dev); + rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(holder); + +static ssize_t force_raw_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool force_raw; + int rc = strtobool(buf, &force_raw); + + if (rc) + return rc; + + to_ndns(dev)->force_raw = force_raw; + return len; +} + +static ssize_t force_raw_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", to_ndns(dev)->force_raw); +} +static DEVICE_ATTR_RW(force_raw); + static struct attribute *nd_namespace_attributes[] = { &dev_attr_nstype.attr, &dev_attr_size.attr, &dev_attr_uuid.attr, + &dev_attr_holder.attr, &dev_attr_resource.attr, &dev_attr_alt_name.attr, + &dev_attr_force_raw.attr, &dev_attr_sector_size.attr, &dev_attr_dpa_extents.attr, NULL, @@ -1066,7 +1128,9 @@ static umode_t namespace_visible(struct kobject *kobj, return a->mode; } - if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr) + if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr + || a == &dev_attr_holder.attr + || a == &dev_attr_force_raw.attr) return a->mode; return 0; @@ -1083,6 +1147,66 @@ static const struct attribute_group *nd_namespace_attribute_groups[] = { NULL, }; +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) +{ + struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL; + struct nd_namespace_common *ndns; + resource_size_t size; + + if (nd_btt) { + ndns = nd_btt->ndns; + if (!ndns) + return ERR_PTR(-ENODEV); + + /* + * Flush any in-progess probes / removals in the driver + * for the raw personality of this namespace. + */ + device_lock(&ndns->dev); + device_unlock(&ndns->dev); + if (ndns->dev.driver) { + dev_dbg(&ndns->dev, "is active, can't bind %s\n", + dev_name(&nd_btt->dev)); + return ERR_PTR(-EBUSY); + } + if (dev_WARN_ONCE(&ndns->dev, ndns->claim != &nd_btt->dev, + "host (%s) vs claim (%s) mismatch\n", + dev_name(&nd_btt->dev), + dev_name(ndns->claim))) + return ERR_PTR(-ENXIO); + } else { + ndns = to_ndns(dev); + if (ndns->claim) { + dev_dbg(dev, "claimed by %s, failing probe\n", + dev_name(ndns->claim)); + + return ERR_PTR(-ENXIO); + } + } + + size = nvdimm_namespace_capacity(ndns); + if (size < ND_MIN_NAMESPACE_SIZE) { + dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n", + &size, ND_MIN_NAMESPACE_SIZE); + return ERR_PTR(-ENODEV); + } + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + if (!nspm->uuid) { + dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__); + return ERR_PTR(-ENODEV); + } + } else if (is_namespace_blk(&ndns->dev)) { + return ERR_PTR(-ENODEV); /* TODO */ + } + + return ndns; +} +EXPORT_SYMBOL(nvdimm_namespace_common_probe); + static struct device **create_namespace_io(struct nd_region *nd_region) { struct nd_namespace_io *nsio; @@ -1099,7 +1223,7 @@ static struct device **create_namespace_io(struct nd_region *nd_region) return NULL; } - dev = &nsio->dev; + dev = &nsio->common.dev; dev->type = &namespace_io_device_type; dev->parent = &nd_region->dev; res = &nsio->res; @@ -1313,7 +1437,7 @@ static struct device **create_namespace_pmem(struct nd_region *nd_region) if (!nspm) return NULL; - dev = &nspm->nsio.dev; + dev = &nspm->nsio.common.dev; dev->type = &namespace_pmem_device_type; dev->parent = &nd_region->dev; res = &nspm->nsio.res; @@ -1346,7 +1470,7 @@ static struct device **create_namespace_pmem(struct nd_region *nd_region) return devs; err: - namespace_pmem_release(&nspm->nsio.dev); + namespace_pmem_release(&nspm->nsio.common.dev); return NULL; } @@ -1385,7 +1509,7 @@ static struct device *nd_namespace_blk_create(struct nd_region *nd_region) if (!nsblk) return NULL; - dev = &nsblk->dev; + dev = &nsblk->common.dev; dev->type = &namespace_blk_device_type; nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); if (nsblk->id < 0) { @@ -1396,7 +1520,7 @@ static struct device *nd_namespace_blk_create(struct nd_region *nd_region) dev->parent = &nd_region->dev; dev->groups = nd_namespace_attribute_groups; - return &nsblk->dev; + return &nsblk->common.dev; } void nd_region_create_blk_seed(struct nd_region *nd_region) @@ -1413,6 +1537,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region) nd_device_register(nd_region->ns_seed); } +void nd_region_create_btt_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->btt_seed = nd_btt_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->btt_seed) + dev_err(&nd_region->dev, "failed to create btt namespace\n"); +} + static struct device **create_namespace_blk(struct nd_region *nd_region) { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; @@ -1446,7 +1582,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) if (!res) goto err; nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->dev)); + dev_name(&nsblk->common.dev)); break; } } @@ -1462,7 +1598,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); if (!nsblk) goto err; - dev = &nsblk->dev; + dev = &nsblk->common.dev; dev->type = &namespace_blk_device_type; dev->parent = &nd_region->dev; dev_set_name(dev, "namespace%d.%d", nd_region->id, count); @@ -1482,7 +1618,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) if (!res) goto err; nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->dev)); + dev_name(&nsblk->common.dev)); } dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n", @@ -1503,7 +1639,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); if (!nsblk) goto err; - dev = &nsblk->dev; + dev = &nsblk->common.dev; dev->type = &namespace_blk_device_type; dev->parent = &nd_region->dev; devs[count++] = dev; @@ -1514,7 +1650,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) err: for (i = 0; i < count; i++) { nsblk = to_nd_namespace_blk(devs[i]); - namespace_blk_release(&nsblk->dev); + namespace_blk_release(&nsblk->common.dev); } kfree(devs); return NULL; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 78d6c51f4bac..5e6413964776 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -45,12 +45,14 @@ struct nvdimm { bool is_nvdimm(struct device *dev); bool is_nd_blk(struct device *dev); bool is_nd_pmem(struct device *dev); +struct nd_btt; struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); struct nd_region; void nd_region_create_blk_seed(struct nd_region *nd_region); +void nd_region_create_btt_seed(struct nd_region *nd_region); void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); @@ -58,6 +60,7 @@ void nd_synchronize(void); int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus); int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus); int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus); +void __nd_device_register(struct device *dev); int nd_match_dimm(struct device *dev, void *data); struct nd_label_id; char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags); @@ -77,4 +80,5 @@ struct resource *nsblk_add_resource(struct nd_region *nd_region, resource_size_t start); int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); void get_ndd(struct nvdimm_drvdata *ndd); +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index bfa849617358..d13eccbb67e9 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -19,6 +19,10 @@ #include #include "label.h" +enum { + SECTOR_SHIFT = 9, +}; + struct nvdimm_drvdata { struct device *dev; int nsindex_size; @@ -74,7 +78,9 @@ static inline struct nd_namespace_index *to_next_namespace_index( struct nd_region { struct device dev; struct ida ns_ida; + struct ida btt_ida; struct device *ns_seed; + struct device *btt_seed; u16 ndr_mappings; u64 ndr_size; u64 ndr_start; @@ -94,6 +100,14 @@ static inline unsigned nd_inc_seq(unsigned seq) return next[seq & 3]; } +struct nd_btt { + struct device dev; + struct nd_namespace_common *ndns; + unsigned long lbasize; + u8 *uuid; + int id; +}; + enum nd_async_mode { ND_SYNC, ND_ASYNC, @@ -118,6 +132,30 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, void *buf, size_t len); +struct nd_btt *to_nd_btt(struct device *dev); +struct btt_sb; +u64 nd_btt_sb_checksum(struct btt_sb *btt_sb); +#if IS_ENABLED(CONFIG_BTT) +int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata); +bool is_nd_btt(struct device *dev); +struct device *nd_btt_create(struct nd_region *nd_region); +#else +static inline nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata) +{ + return -ENODEV; +} + +static inline bool is_nd_btt(struct device *dev) +{ + return false; +} + +static inline struct device *nd_btt_create(struct nd_region *nd_region) +{ + return NULL; +} + +#endif struct nd_region *to_nd_region(struct device *dev); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); @@ -132,4 +170,6 @@ void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res); struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id, resource_size_t start, resource_size_t n); +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev); #endif /* __ND_H__ */ diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 90902a142e35..d0c6b4bdba69 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -121,44 +121,61 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res, int id) { struct pmem_device *pmem; - struct gendisk *disk; - int err; - err = -ENOMEM; pmem = kzalloc(sizeof(*pmem), GFP_KERNEL); if (!pmem) - goto out; + return ERR_PTR(-ENOMEM); pmem->phys_addr = res->start; pmem->size = resource_size(res); - err = -EINVAL; - if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) { + if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) { dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size); - goto out_free_dev; + kfree(pmem); + return ERR_PTR(-EBUSY); } /* * Map the memory as non-cachable, as we can't write back the contents * of the CPU caches in case of a crash. */ - err = -ENOMEM; pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size); - if (!pmem->virt_addr) - goto out_release_region; + if (!pmem->virt_addr) { + release_mem_region(pmem->phys_addr, pmem->size); + kfree(pmem); + return ERR_PTR(-ENXIO); + } + + return pmem; +} + +static void pmem_detach_disk(struct pmem_device *pmem) +{ + del_gendisk(pmem->pmem_disk); + put_disk(pmem->pmem_disk); + blk_cleanup_queue(pmem->pmem_queue); +} + +static int pmem_attach_disk(struct nd_namespace_common *ndns, + struct pmem_device *pmem) +{ + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + struct gendisk *disk; pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); if (!pmem->pmem_queue) - goto out_unmap; + return -ENOMEM; blk_queue_make_request(pmem->pmem_queue, pmem_make_request); blk_queue_max_hw_sectors(pmem->pmem_queue, 1024); blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); disk = alloc_disk(0); - if (!disk) - goto out_free_queue; + if (!disk) { + blk_cleanup_queue(pmem->pmem_queue); + return -ENOMEM; + } disk->major = pmem_major; disk->first_minor = 0; @@ -166,32 +183,47 @@ static struct pmem_device *pmem_alloc(struct device *dev, disk->private_data = pmem; disk->queue = pmem->pmem_queue; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "pmem%d", id); - disk->driverfs_dev = dev; + sprintf(disk->disk_name, "pmem%d", nd_region->id); + disk->driverfs_dev = &ndns->dev; set_capacity(disk, pmem->size >> 9); pmem->pmem_disk = disk; add_disk(disk); - return pmem; + return 0; +} -out_free_queue: - blk_cleanup_queue(pmem->pmem_queue); -out_unmap: - iounmap(pmem->virt_addr); -out_release_region: - release_mem_region(pmem->phys_addr, pmem->size); -out_free_dev: - kfree(pmem); -out: - return ERR_PTR(err); +static int pmem_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size, int rw) +{ + struct pmem_device *pmem = dev_get_drvdata(ndns->claim); + + if (unlikely(offset + size > pmem->size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (rw == READ) + memcpy(buf, pmem->virt_addr + offset, size); + else + memcpy(pmem->virt_addr + offset, buf, size); + + return 0; +} + +static int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) +{ + /* TODO */ + return -ENXIO; +} + +static void nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns) +{ + /* TODO */ } static void pmem_free(struct pmem_device *pmem) { - del_gendisk(pmem->pmem_disk); - put_disk(pmem->pmem_disk); - blk_cleanup_queue(pmem->pmem_queue); iounmap(pmem->virt_addr); release_mem_region(pmem->phys_addr, pmem->size); kfree(pmem); @@ -200,40 +232,44 @@ static void pmem_free(struct pmem_device *pmem) static int nd_pmem_probe(struct device *dev) { struct nd_region *nd_region = to_nd_region(dev->parent); - struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + struct nd_namespace_common *ndns; + struct nd_namespace_io *nsio; struct pmem_device *pmem; + int rc; - if (resource_size(&nsio->res) < ND_MIN_NAMESPACE_SIZE) { - resource_size_t size = resource_size(&nsio->res); - - dev_dbg(dev, "%s: size: %pa, too small must be at least %#x\n", - __func__, &size, ND_MIN_NAMESPACE_SIZE); - return -ENODEV; - } - - if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_PMEM) { - struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - - if (!nspm->uuid) { - dev_dbg(dev, "%s: uuid not set\n", __func__); - return -ENODEV; - } - } + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + nsio = to_nd_namespace_io(&ndns->dev); pmem = pmem_alloc(dev, &nsio->res, nd_region->id); if (IS_ERR(pmem)) return PTR_ERR(pmem); dev_set_drvdata(dev, pmem); - - return 0; + ndns->rw_bytes = pmem_rw_bytes; + if (is_nd_btt(dev)) + rc = nvdimm_namespace_attach_btt(ndns); + else if (nd_btt_probe(ndns, pmem) == 0) { + /* we'll come back as btt-pmem */ + rc = -ENXIO; + } else + rc = pmem_attach_disk(ndns, pmem); + if (rc) + pmem_free(pmem); + return rc; } static int nd_pmem_remove(struct device *dev) { struct pmem_device *pmem = dev_get_drvdata(dev); + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns); + else + pmem_detach_disk(pmem); pmem_free(pmem); + return 0; } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 9aba44e483e0..2a5f3f53d79d 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -33,12 +33,13 @@ static int nd_region_probe(struct device *dev) num_ns->count = rc + err; dev_set_drvdata(dev, num_ns); + if (rc && err && rc == err) + return -ENODEV; + + nd_region->btt_seed = nd_btt_create(nd_region); if (err == 0) return 0; - if (rc == err) - return -ENODEV; - /* * Given multiple namespaces per region, we do not want to * disable all the successfully registered peer namespaces upon @@ -66,6 +67,7 @@ static int nd_region_remove(struct device *dev) /* flush attribute readers and disable */ nvdimm_bus_lock(dev); nd_region->ns_seed = NULL; + nd_region->btt_seed = NULL; dev_set_drvdata(dev, NULL); nvdimm_bus_unlock(dev); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index ac21ce419beb..4fd92389fa7e 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -296,10 +296,28 @@ static ssize_t namespace_seed_show(struct device *dev, } static DEVICE_ATTR_RO(namespace_seed); +static ssize_t btt_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->btt_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(btt_seed); + static struct attribute *nd_region_attributes[] = { &dev_attr_size.attr, &dev_attr_nstype.attr, &dev_attr_mappings.attr, + &dev_attr_btt_seed.attr, &dev_attr_set_cookie.attr, &dev_attr_available_size.attr, &dev_attr_namespace_seed.attr, @@ -345,15 +363,18 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) /* * Upon successful probe/remove, take/release a reference on the - * associated interleave set (if present) + * associated interleave set (if present), and plant new btt + namespace + * seeds. */ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, struct device *dev, bool probe) { + struct nd_region *nd_region; + if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) { - struct nd_region *nd_region = to_nd_region(dev); int i; + nd_region = to_nd_region(dev); for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm_drvdata *ndd = nd_mapping->ndd; @@ -365,14 +386,21 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nd_mapping->ndd = NULL; atomic_dec(&nvdimm->busy); } - } else if (dev->parent && is_nd_blk(dev->parent) && probe) { - struct nd_region *nd_region = to_nd_region(dev->parent); - + } + if (dev->parent && is_nd_blk(dev->parent) && probe) { + nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->ns_seed == dev) nd_region_create_blk_seed(nd_region); nvdimm_bus_unlock(dev); } + if (is_nd_btt(dev) && probe) { + nd_region = to_nd_region(dev->parent); + nvdimm_bus_lock(dev); + if (nd_region->btt_seed == dev) + nd_region_create_btt_seed(nd_region); + nvdimm_bus_unlock(dev); + } } void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev) @@ -546,6 +574,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->provider_data = ndr_desc->provider_data; nd_region->nd_set = ndr_desc->nd_set; ida_init(&nd_region->ns_ida); + ida_init(&nd_region->btt_ida); dev = &nd_region->dev; dev_set_name(dev, "region%d", nd_region->id); dev->parent = &nvdimm_bus->dev; diff --git a/include/linux/nd.h b/include/linux/nd.h index 23276ea91690..507e47c86737 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -12,6 +12,7 @@ */ #ifndef __LINUX_ND_H__ #define __LINUX_ND_H__ +#include #include #include @@ -28,13 +29,33 @@ static inline struct nd_device_driver *to_nd_device_driver( return container_of(drv, struct nd_device_driver, drv); }; +/** + * struct nd_namespace_common - core infrastructure of a namespace + * @force_raw: ignore other personalities for the namespace (e.g. btt) + * @dev: device model node + * @claim: when set a another personality has taken ownership of the namespace + * @rw_bytes: access the raw namespace capacity with byte-aligned transfers + */ +struct nd_namespace_common { + int force_raw; + struct device dev; + struct device *claim; + int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, + void *buf, size_t size, int rw); +}; + +static inline struct nd_namespace_common *to_ndns(struct device *dev) +{ + return container_of(dev, struct nd_namespace_common, dev); +} + /** * struct nd_namespace_io - infrastructure for loading an nd_pmem instance * @dev: namespace device created by the nd region driver * @res: struct resource conversion of a NFIT SPA table */ struct nd_namespace_io { - struct device dev; + struct nd_namespace_common common; struct resource res; }; @@ -52,7 +73,6 @@ struct nd_namespace_pmem { /** * struct nd_namespace_blk - namespace for dimm-bounded persistent memory - * @dev: namespace device creation by the nd region driver * @alt_name: namespace name supplied in the dimm label * @uuid: namespace name supplied in the dimm label * @id: ida allocated id @@ -61,7 +81,7 @@ struct nd_namespace_pmem { * @res: discontiguous dpa extents for given dimm */ struct nd_namespace_blk { - struct device dev; + struct nd_namespace_common common; char *alt_name; u8 *uuid; int id; @@ -72,7 +92,7 @@ struct nd_namespace_blk { static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) { - return container_of(dev, struct nd_namespace_io, dev); + return container_of(dev, struct nd_namespace_io, common.dev); } static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) @@ -84,7 +104,40 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) { - return container_of(dev, struct nd_namespace_blk, dev); + return container_of(dev, struct nd_namespace_blk, common.dev); +} + +/** + * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to fill + * @size: transfer length + * + * @buf is up-to-date upon return from this routine. + */ +static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, READ); +} + +/** + * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to drain + * @size: transfer length + * + * NVDIMM Namepaces disks do not implement sectors internally. Depending on + * the @ndns, the contents of @buf may be in cpu cache, platform buffers, + * or on backing memory media upon return from this routine. Flushing + * to media is handled internal to the @ndns driver, if at all. + */ +static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, WRITE); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit v1.2.3-59-g8ed1b From 5212e11fde4d40fa627668b4f2222d20db488f71 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Jun 2015 04:20:32 -0400 Subject: nd_btt: atomic sector updates BTT stands for Block Translation Table, and is a way to provide power fail sector atomicity semantics for block devices that have the ability to perform byte granularity IO. It relies on the capability of libnvdimm namespace devices to do byte aligned IO. The BTT works as a stacked blocked device, and reserves a chunk of space from the backing device for its accounting metadata. It is a bio-based driver because all IO is done synchronously, and there is no queuing or asynchronous completions at either the device or the driver level. The BTT uses 'lanes' to index into various 'on-disk' data structures, and lanes also act as a synchronization mechanism in case there are more CPUs than available lanes. We did a comparison between two lane lock strategies - first where we kept an atomic counter around that tracked which was the last lane that was used, and 'our' lane was determined by atomically incrementing that. That way, for the nr_cpus > nr_lanes case, theoretically, no CPU would be blocked waiting for a lane. The other strategy was to use the cpu number we're scheduled on to and hash it to a lane number. Theoretically, this could block an IO that could've otherwise run using a different, free lane. But some fio workloads showed that the direct cpu -> lane hash performed faster than tracking 'last lane' - my reasoning is the cache thrash caused by moving the atomic variable made that approach slower than simply waiting out the in-progress IO. This supports the conclusion that the driver can be a very simple bio-based one that does synchronous IOs instead of queuing. Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Neil Brown Cc: Jeff Moyer Cc: Dave Chinner Cc: Greg KH [jmoyer: fix nmi watchdog timeout in btt_map_init] [jmoyer: move btt initialization to module load path] [jmoyer: fix memory leak in the btt initialization path] [jmoyer: Don't overwrite corrupted arenas] Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- Documentation/nvdimm/btt.txt | 273 ++++++++ drivers/acpi/nfit.c | 1 + drivers/nvdimm/Kconfig | 28 +- drivers/nvdimm/Makefile | 3 + drivers/nvdimm/btt.c | 1371 +++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/btt.h | 141 ++++ drivers/nvdimm/btt_devs.c | 3 +- drivers/nvdimm/namespace_devs.c | 24 + drivers/nvdimm/nd.h | 22 +- drivers/nvdimm/pmem.c | 14 +- drivers/nvdimm/region.c | 12 + drivers/nvdimm/region_devs.c | 82 ++- include/linux/libnvdimm.h | 1 + 13 files changed, 1950 insertions(+), 25 deletions(-) create mode 100644 Documentation/nvdimm/btt.txt create mode 100644 drivers/nvdimm/btt.c (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt new file mode 100644 index 000000000000..95134d5ec4a0 --- /dev/null +++ b/Documentation/nvdimm/btt.txt @@ -0,0 +1,273 @@ +BTT - Block Translation Table +============================= + + +1. Introduction +--------------- + +Persistent memory based storage is able to perform IO at byte (or more +accurately, cache line) granularity. However, we often want to expose such +storage as traditional block devices. The block drivers for persistent memory +will do exactly this. However, they do not provide any atomicity guarantees. +Traditional SSDs typically provide protection against torn sectors in hardware, +using stored energy in capacitors to complete in-flight block writes, or perhaps +in firmware. We don't have this luxury with persistent memory - if a write is in +progress, and we experience a power failure, the block will contain a mix of old +and new data. Applications may not be prepared to handle such a scenario. + +The Block Translation Table (BTT) provides atomic sector update semantics for +persistent memory devices, so that applications that rely on sector writes not +being torn can continue to do so. The BTT manifests itself as a stacked block +device, and reserves a portion of the underlying storage for its metadata. At +the heart of it, is an indirection table that re-maps all the blocks on the +volume. It can be thought of as an extremely simple file system that only +provides atomic sector updates. + + +2. Static Layout +---------------- + +The underlying storage on which a BTT can be laid out is not limited in any way. +The BTT, however, splits the available space into chunks of up to 512 GiB, +called "Arenas". + +Each arena follows the same layout for its metadata, and all references in an +arena are internal to it (with the exception of one field that points to the +next arena). The following depicts the "On-disk" metadata layout: + + + Backing Store +-------> Arena ++---------------+ | +------------------+ +| | | | Arena info block | +| Arena 0 +---+ | 4K | +| 512G | +------------------+ +| | | | ++---------------+ | | +| | | | +| Arena 1 | | Data Blocks | +| 512G | | | +| | | | ++---------------+ | | +| . | | | +| . | | | +| . | | | +| | | | +| | | | ++---------------+ +------------------+ + | | + | BTT Map | + | | + | | + +------------------+ + | | + | BTT Flog | + | | + +------------------+ + | Info block copy | + | 4K | + +------------------+ + + +3. Theory of Operation +---------------------- + + +a. The BTT Map +-------------- + +The map is a simple lookup/indirection table that maps an LBA to an internal +block. Each map entry is 32 bits. The two most significant bits are special +flags, and the remaining form the internal block number. + +Bit Description +31 : TRIM flag - marks if the block was trimmed or discarded +30 : ERROR flag - marks an error block. Cleared on write. +29 - 0 : Mappings to internal 'postmap' blocks + + +Some of the terminology that will be subsequently used: + +External LBA : LBA as made visible to upper layers. +ABA : Arena Block Address - Block offset/number within an arena +Premap ABA : The block offset into an arena, which was decided upon by range + checking the External LBA +Postmap ABA : The block number in the "Data Blocks" area obtained after + indirection from the map +nfree : The number of free blocks that are maintained at any given time. + This is the number of concurrent writes that can happen to the + arena. + + +For example, after adding a BTT, we surface a disk of 1024G. We get a read for +the external LBA at 768G. This falls into the second arena, and of the 512G +worth of blocks that this arena contributes, this block is at 256G. Thus, the +premap ABA is 256G. We now refer to the map, and find out the mapping for block +'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64. + + +b. The BTT Flog +--------------- + +The BTT provides sector atomicity by making every write an "allocating write", +i.e. Every write goes to a "free" block. A running list of free blocks is +maintained in the form of the BTT flog. 'Flog' is a combination of the words +"free list" and "log". The flog contains 'nfree' entries, and an entry contains: + +lba : The premap ABA that is being written to +old_map : The old postmap ABA - after 'this' write completes, this will be a + free block. +new_map : The new postmap ABA. The map will up updated to reflect this + lba->postmap_aba mapping, but we log it here in case we have to + recover. +seq : Sequence number to mark which of the 2 sections of this flog entry is + valid/newest. It cycles between 01->10->11->01 (binary) under normal + operation, with 00 indicating an uninitialized state. +lba' : alternate lba entry +old_map': alternate old postmap entry +new_map': alternate new postmap entry +seq' : alternate sequence number. + +Each of the above fields is 32-bit, making one entry 16 bytes. Flog updates are +done such that for any entry being written, it: +a. overwrites the 'old' section in the entry based on sequence numbers +b. writes the new entry such that the sequence number is written last. + + +c. The concept of lanes +----------------------- + +While 'nfree' describes the number of concurrent IOs an arena can process +concurrently, 'nlanes' is the number of IOs the BTT device as a whole can +process. + nlanes = min(nfree, num_cpus) +A lane number is obtained at the start of any IO, and is used for indexing into +all the on-disk and in-memory data structures for the duration of the IO. It is +protected by a spinlock. + + +d. In-memory data structure: Read Tracking Table (RTT) +------------------------------------------------------ + +Consider a case where we have two threads, one doing reads and the other, +writes. We can hit a condition where the writer thread grabs a free block to do +a new IO, but the (slow) reader thread is still reading from it. In other words, +the reader consulted a map entry, and started reading the corresponding block. A +writer started writing to the same external LBA, and finished the write updating +the map for that external LBA to point to its new postmap ABA. At this point the +internal, postmap block that the reader is (still) reading has been inserted +into the list of free blocks. If another write comes in for the same LBA, it can +grab this free block, and start writing to it, causing the reader to read +incorrect data. To prevent this, we introduce the RTT. + +The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts +into rtt[lane_number], the postmap ABA it is reading, and clears it after the +read is complete. Every writer thread, after grabbing a free block, checks the +RTT for its presence. If the postmap free block is in the RTT, it waits till the +reader clears the RTT entry, and only then starts writing to it. + + +e. In-memory data structure: map locks +-------------------------------------- + +Consider a case where two writer threads are writing to the same LBA. There can +be a race in the following sequence of steps: + +free[lane] = map[premap_aba] +map[premap_aba] = postmap_aba + +Both threads can update their respective free[lane] with the same old, freed +postmap_aba. This has made the layout inconsistent by losing a free entry, and +at the same time, duplicating another free entry for two lanes. + +To solve this, we could have a single map lock (per arena) that has to be taken +before performing the above sequence, but we feel that could be too contentious. +Instead we use an array of (nfree) map_locks that is indexed by +(premap_aba modulo nfree). + + +f. Reconstruction from the Flog +------------------------------- + +On startup, we analyze the BTT flog to create our list of free blocks. We walk +through all the entries, and for each lane, of the set of two possible +'sections', we always look at the most recent one only (based on the sequence +number). The reconstruction rules/steps are simple: +- Read map[log_entry.lba]. +- If log_entry.new matches the map entry, then log_entry.old is free. +- If log_entry.new does not match the map entry, then log_entry.new is free. + (This case can only be caused by power-fails/unsafe shutdowns) + + +g. Summarizing - Read and Write flows +------------------------------------- + +Read: + +1. Convert external LBA to arena number + pre-map ABA +2. Get a lane (and take lane_lock) +3. Read map to get the entry for this pre-map ABA +4. Enter post-map ABA into RTT[lane] +5. If TRIM flag set in map, return zeroes, and end IO (go to step 8) +6. If ERROR flag set in map, end IO with EIO (go to step 8) +7. Read data from this block +8. Remove post-map ABA entry from RTT[lane] +9. Release lane (and lane_lock) + +Write: + +1. Convert external LBA to Arena number + pre-map ABA +2. Get a lane (and take lane_lock) +3. Use lane to index into in-memory free list and obtain a new block, next flog + index, next sequence number +4. Scan the RTT to check if free block is present, and spin/wait if it is. +5. Write data to this free block +6. Read map to get the existing post-map ABA entry for this pre-map ABA +7. Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num] +8. Write new post-map ABA into map. +9. Write old post-map entry into the free list +10. Calculate next sequence number and write into the free list entry +11. Release lane (and lane_lock) + + +4. Error Handling +================= + +An arena would be in an error state if any of the metadata is corrupted +irrecoverably, either due to a bug or a media error. The following conditions +indicate an error: +- Info block checksum does not match (and recovering from the copy also fails) +- All internal available blocks are not uniquely and entirely addressed by the + sum of mapped blocks and free blocks (from the BTT flog). +- Rebuilding free list from the flog reveals missing/duplicate/impossible + entries +- A map entry is out of bounds + +If any of these error conditions are encountered, the arena is put into a read +only state using a flag in the info block. + + +5. In-kernel usage +================== + +Any block driver that supports byte granularity IO to the storage may register +with the BTT. It will have to provide the rw_bytes interface in its +block_device_operations struct: + + int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw); + +It may register with the BTT after it adds its own gendisk, using btt_init: + + struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize, + u32 lbasize, u8 uuid[], int maxlane); + +note that maxlane is the maximum amount of concurrency the driver wishes to +allow the BTT to use. + +The BTT 'disk' appears as a stacked block device that grabs the underlying block +device in the O_EXCL mode. + +When the driver wishes to remove the backing disk, it should similarly call +btt_fini using the same struct btt* handle that was provided to it by btt_init. + + void btt_fini(struct btt *btt); + diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 35af6f7f0abd..fc38b49eff7d 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -902,6 +902,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, } else { nd_mapping->size = nfit_mem->bdw->capacity; nd_mapping->start = nfit_mem->bdw->start_address; + ndr_desc->num_lanes = nfit_mem->bdw->windows; blk_valid = 1; } diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 5680e8e7a7aa..204ee0796411 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -8,11 +8,11 @@ menuconfig LIBNVDIMM NFIT, or otherwise can discover NVDIMM resources, a libnvdimm bus is registered to advertise PMEM (persistent memory) namespaces (/dev/pmemX) and BLK (sliding mmio window(s)) - namespaces (/dev/ndX). A PMEM namespace refers to a memory - resource that may span multiple DIMMs and support DAX (see - CONFIG_DAX). A BLK namespace refers to an NVDIMM control - region which exposes an mmio register set for windowed - access mode to non-volatile memory. + namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a + memory resource that may span multiple DIMMs and support DAX + (see CONFIG_DAX). A BLK namespace refers to an NVDIMM control + region which exposes an mmio register set for windowed access + mode to non-volatile memory. if LIBNVDIMM @@ -20,6 +20,7 @@ config BLK_DEV_PMEM tristate "PMEM: Persistent memory block device support" default LIBNVDIMM depends on HAS_IOMEM + select ND_BTT if BTT help Memory ranges for PMEM are described by either an NFIT (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a @@ -33,7 +34,22 @@ config BLK_DEV_PMEM Say Y if you want to use an NVDIMM +config ND_BTT + tristate + config BTT - def_bool y + bool "BTT: Block Translation Table (atomic sector updates)" + default y if LIBNVDIMM + help + The Block Translation Table (BTT) provides atomic sector + update semantics for persistent memory devices, so that + applications that rely on sector writes not being torn (a + guarantee that typical disks provide) can continue to do so. + The BTT manifests itself as an alternate personality for an + NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX, + ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys, + etc...). + + Select Y if unsure endif diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 6085b4bd7312..d2aab6c58492 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -1,8 +1,11 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o +obj-$(CONFIG_ND_BTT) += nd_btt.o nd_pmem-y := pmem.o +nd_btt-y := btt.o + libnvdimm-y := core.o libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c new file mode 100644 index 000000000000..7ae38aac2c25 --- /dev/null +++ b/drivers/nvdimm/btt.c @@ -0,0 +1,1371 @@ +/* + * Block Translation Table + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "btt.h" +#include "nd.h" + +enum log_ent_request { + LOG_NEW_ENT = 0, + LOG_OLD_ENT +}; + +static int btt_major; + +static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_read_bytes(ndns, offset, buf, n); +} + +static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_write_bytes(ndns, offset, buf, n); +} + +static int btt_info_write(struct arena_info *arena, struct btt_sb *super) +{ + int ret; + + ret = arena_write_bytes(arena, arena->info2off, super, + sizeof(struct btt_sb)); + if (ret) + return ret; + + return arena_write_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb)); +} + +static int btt_info_read(struct arena_info *arena, struct btt_sb *super) +{ + WARN_ON(!super); + return arena_read_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb)); +} + +/* + * 'raw' version of btt_map write + * Assumptions: + * mapping is in little-endian + * mapping contains 'E' and 'Z' flags as desired + */ +static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping) +{ + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE); +} + +static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, + u32 z_flag, u32 e_flag) +{ + u32 ze; + __le32 mapping_le; + + /* + * This 'mapping' is supposed to be just the LBA mapping, without + * any flags set, so strip the flag bits. + */ + mapping &= MAP_LBA_MASK; + + ze = (z_flag << 1) + e_flag; + switch (ze) { + case 0: + /* + * We want to set neither of the Z or E flags, and + * in the actual layout, this means setting the bit + * positions of both to '1' to indicate a 'normal' + * map entry + */ + mapping |= MAP_ENT_NORMAL; + break; + case 1: + mapping |= (1 << MAP_ERR_SHIFT); + break; + case 2: + mapping |= (1 << MAP_TRIM_SHIFT); + break; + default: + /* + * The case where Z and E are both sent in as '1' could be + * construed as a valid 'normal' case, but we decide not to, + * to avoid confusion + */ + WARN_ONCE(1, "Invalid use of Z and E flags\n"); + return -EIO; + } + + mapping_le = cpu_to_le32(mapping); + return __btt_map_write(arena, lba, mapping_le); +} + +static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, + int *trim, int *error) +{ + int ret; + __le32 in; + u32 raw_mapping, postmap, ze, z_flag, e_flag; + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE); + if (ret) + return ret; + + raw_mapping = le32_to_cpu(in); + + z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT; + e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT; + ze = (z_flag << 1) + e_flag; + postmap = raw_mapping & MAP_LBA_MASK; + + /* Reuse the {z,e}_flag variables for *trim and *error */ + z_flag = 0; + e_flag = 0; + + switch (ze) { + case 0: + /* Initial state. Return postmap = premap */ + *mapping = lba; + break; + case 1: + *mapping = postmap; + e_flag = 1; + break; + case 2: + *mapping = postmap; + z_flag = 1; + break; + case 3: + *mapping = postmap; + break; + default: + return -EIO; + } + + if (trim) + *trim = z_flag; + if (error) + *error = e_flag; + + return ret; +} + +static int btt_log_read_pair(struct arena_info *arena, u32 lane, + struct log_entry *ent) +{ + WARN_ON(!ent); + return arena_read_bytes(arena, + arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, + 2 * LOG_ENT_SIZE); +} + +static struct dentry *debugfs_root; + +static void arena_debugfs_init(struct arena_info *a, struct dentry *parent, + int idx) +{ + char dirname[32]; + struct dentry *d; + + /* If for some reason, parent bttN was not created, exit */ + if (!parent) + return; + + snprintf(dirname, 32, "arena%d", idx); + d = debugfs_create_dir(dirname, parent); + if (IS_ERR_OR_NULL(d)) + return; + a->debugfs_dir = d; + + debugfs_create_x64("size", S_IRUGO, d, &a->size); + debugfs_create_x64("external_lba_start", S_IRUGO, d, + &a->external_lba_start); + debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba); + debugfs_create_u32("internal_lbasize", S_IRUGO, d, + &a->internal_lbasize); + debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba); + debugfs_create_u32("external_lbasize", S_IRUGO, d, + &a->external_lbasize); + debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree); + debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major); + debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor); + debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff); + debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff); + debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff); + debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff); + debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff); + debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off); + debugfs_create_x32("flags", S_IRUGO, d, &a->flags); +} + +static void btt_debugfs_init(struct btt *btt) +{ + int i = 0; + struct arena_info *arena; + + btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev), + debugfs_root); + if (IS_ERR_OR_NULL(btt->debugfs_dir)) + return; + + list_for_each_entry(arena, &btt->arena_list, list) { + arena_debugfs_init(arena, btt->debugfs_dir, i); + i++; + } +} + +/* + * This function accepts two log entries, and uses the + * sequence number to find the 'older' entry. + * It also updates the sequence number in this old entry to + * make it the 'new' one if the mark_flag is set. + * Finally, it returns which of the entries was the older one. + * + * TODO The logic feels a bit kludge-y. make it better.. + */ +static int btt_log_get_old(struct log_entry *ent) +{ + int old; + + /* + * the first ever time this is seen, the entry goes into [0] + * the next time, the following logic works out to put this + * (next) entry into [1] + */ + if (ent[0].seq == 0) { + ent[0].seq = cpu_to_le32(1); + return 0; + } + + if (ent[0].seq == ent[1].seq) + return -EINVAL; + if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5) + return -EINVAL; + + if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) { + if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1) + old = 0; + else + old = 1; + } else { + if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1) + old = 1; + else + old = 0; + } + + return old; +} + +static struct device *to_dev(struct arena_info *arena) +{ + return &arena->nd_btt->dev; +} + +/* + * This function copies the desired (old/new) log entry into ent if + * it is not NULL. It returns the sub-slot number (0 or 1) + * where the desired log entry was found. Negative return values + * indicate errors. + */ +static int btt_log_read(struct arena_info *arena, u32 lane, + struct log_entry *ent, int old_flag) +{ + int ret; + int old_ent, ret_ent; + struct log_entry log[2]; + + ret = btt_log_read_pair(arena, lane, log); + if (ret) + return -EIO; + + old_ent = btt_log_get_old(log); + if (old_ent < 0 || old_ent > 1) { + dev_info(to_dev(arena), + "log corruption (%d): lane %d seq [%d, %d]\n", + old_ent, lane, log[0].seq, log[1].seq); + /* TODO set error state? */ + return -EIO; + } + + ret_ent = (old_flag ? old_ent : (1 - old_ent)); + + if (ent != NULL) + memcpy(ent, &log[ret_ent], LOG_ENT_SIZE); + + return ret_ent; +} + +/* + * This function commits a log entry to media + * It does _not_ prepare the freelist entry for the next write + * btt_flog_write is the wrapper for updating the freelist elements + */ +static int __btt_log_write(struct arena_info *arena, u32 lane, + u32 sub, struct log_entry *ent) +{ + int ret; + /* + * Ignore the padding in log_entry for calculating log_half. + * The entry is 'committed' when we write the sequence number, + * and we want to ensure that that is the last thing written. + * We don't bother writing the padding as that would be extra + * media wear and write amplification + */ + unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2; + u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE); + void *src = ent; + + /* split the 16B write into atomic, durable halves */ + ret = arena_write_bytes(arena, ns_off, src, log_half); + if (ret) + return ret; + + ns_off += log_half; + src += log_half; + return arena_write_bytes(arena, ns_off, src, log_half); +} + +static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, + struct log_entry *ent) +{ + int ret; + + ret = __btt_log_write(arena, lane, sub, ent); + if (ret) + return ret; + + /* prepare the next free entry */ + arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; + if (++(arena->freelist[lane].seq) == 4) + arena->freelist[lane].seq = 1; + arena->freelist[lane].block = le32_to_cpu(ent->old_map); + + return ret; +} + +/* + * This function initializes the BTT map to the initial state, which is + * all-zeroes, and indicates an identity mapping + */ +static int btt_map_init(struct arena_info *arena) +{ + int ret = -EINVAL; + void *zerobuf; + size_t offset = 0; + size_t chunk_size = SZ_2M; + size_t mapsize = arena->logoff - arena->mapoff; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + + while (mapsize) { + size_t size = min(mapsize, chunk_size); + + ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, + size); + if (ret) + goto free; + + offset += size; + mapsize -= size; + cond_resched(); + } + + free: + kfree(zerobuf); + return ret; +} + +/* + * This function initializes the BTT log with 'fake' entries pointing + * to the initial reserved set of blocks as being free + */ +static int btt_log_init(struct arena_info *arena) +{ + int ret; + u32 i; + struct log_entry log, zerolog; + + memset(&zerolog, 0, sizeof(zerolog)); + + for (i = 0; i < arena->nfree; i++) { + log.lba = cpu_to_le32(i); + log.old_map = cpu_to_le32(arena->external_nlba + i); + log.new_map = cpu_to_le32(arena->external_nlba + i); + log.seq = cpu_to_le32(LOG_SEQ_INIT); + ret = __btt_log_write(arena, i, 0, &log); + if (ret) + return ret; + ret = __btt_log_write(arena, i, 1, &zerolog); + if (ret) + return ret; + } + + return 0; +} + +static int btt_freelist_init(struct arena_info *arena) +{ + int old, new, ret; + u32 i, map_entry; + struct log_entry log_new, log_old; + + arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry), + GFP_KERNEL); + if (!arena->freelist) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) { + old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT); + if (old < 0) + return old; + + new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT); + if (new < 0) + return new; + + /* sub points to the next one to be overwritten */ + arena->freelist[i].sub = 1 - new; + arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq)); + arena->freelist[i].block = le32_to_cpu(log_new.old_map); + + /* This implies a newly created or untouched flog entry */ + if (log_new.old_map == log_new.new_map) + continue; + + /* Check if map recovery is needed */ + ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, + NULL, NULL); + if (ret) + return ret; + if ((le32_to_cpu(log_new.new_map) != map_entry) && + (le32_to_cpu(log_new.old_map) == map_entry)) { + /* + * Last transaction wrote the flog, but wasn't able + * to complete the map write. So fix up the map. + */ + ret = btt_map_write(arena, le32_to_cpu(log_new.lba), + le32_to_cpu(log_new.new_map), 0, 0); + if (ret) + return ret; + } + + } + + return 0; +} + +static int btt_rtt_init(struct arena_info *arena) +{ + arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); + if (arena->rtt == NULL) + return -ENOMEM; + + return 0; +} + +static int btt_maplocks_init(struct arena_info *arena) +{ + u32 i; + + arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock), + GFP_KERNEL); + if (!arena->map_locks) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) + spin_lock_init(&arena->map_locks[i].lock); + + return 0; +} + +static struct arena_info *alloc_arena(struct btt *btt, size_t size, + size_t start, size_t arena_off) +{ + struct arena_info *arena; + u64 logsize, mapsize, datasize; + u64 available = size; + + arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL); + if (!arena) + return NULL; + arena->nd_btt = btt->nd_btt; + + if (!size) + return arena; + + arena->size = size; + arena->external_lba_start = start; + arena->external_lbasize = btt->lbasize; + arena->internal_lbasize = roundup(arena->external_lbasize, + INT_LBASIZE_ALIGNMENT); + arena->nfree = BTT_DEFAULT_NFREE; + arena->version_major = 1; + arena->version_minor = 1; + + if (available % BTT_PG_SIZE) + available -= (available % BTT_PG_SIZE); + + /* Two pages are reserved for the super block and its copy */ + available -= 2 * BTT_PG_SIZE; + + /* The log takes a fixed amount of space based on nfree */ + logsize = roundup(2 * arena->nfree * sizeof(struct log_entry), + BTT_PG_SIZE); + available -= logsize; + + /* Calculate optimal split between map and data area */ + arena->internal_nlba = div_u64(available - BTT_PG_SIZE, + arena->internal_lbasize + MAP_ENT_SIZE); + arena->external_nlba = arena->internal_nlba - arena->nfree; + + mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE); + datasize = available - mapsize; + + /* 'Absolute' values, relative to start of storage space */ + arena->infooff = arena_off; + arena->dataoff = arena->infooff + BTT_PG_SIZE; + arena->mapoff = arena->dataoff + datasize; + arena->logoff = arena->mapoff + mapsize; + arena->info2off = arena->logoff + logsize; + return arena; +} + +static void free_arenas(struct btt *btt) +{ + struct arena_info *arena, *next; + + list_for_each_entry_safe(arena, next, &btt->arena_list, list) { + list_del(&arena->list); + kfree(arena->rtt); + kfree(arena->map_locks); + kfree(arena->freelist); + debugfs_remove_recursive(arena->debugfs_dir); + kfree(arena); + } +} + +/* + * This function checks if the metadata layout is valid and error free + */ +static int arena_is_valid(struct arena_info *arena, struct btt_sb *super, + u8 *uuid, u32 lbasize) +{ + u64 checksum; + + if (memcmp(super->uuid, uuid, 16)) + return 0; + + checksum = le64_to_cpu(super->checksum); + super->checksum = 0; + if (checksum != nd_btt_sb_checksum(super)) + return 0; + super->checksum = cpu_to_le64(checksum); + + if (lbasize != le32_to_cpu(super->external_lbasize)) + return 0; + + /* TODO: figure out action for this */ + if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0) + dev_info(to_dev(arena), "Found arena with an error flag\n"); + + return 1; +} + +/* + * This function reads an existing valid btt superblock and + * populates the corresponding arena_info struct + */ +static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, + u64 arena_off) +{ + arena->internal_nlba = le32_to_cpu(super->internal_nlba); + arena->internal_lbasize = le32_to_cpu(super->internal_lbasize); + arena->external_nlba = le32_to_cpu(super->external_nlba); + arena->external_lbasize = le32_to_cpu(super->external_lbasize); + arena->nfree = le32_to_cpu(super->nfree); + arena->version_major = le16_to_cpu(super->version_major); + arena->version_minor = le16_to_cpu(super->version_minor); + + arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off + + le64_to_cpu(super->nextoff)); + arena->infooff = arena_off; + arena->dataoff = arena_off + le64_to_cpu(super->dataoff); + arena->mapoff = arena_off + le64_to_cpu(super->mapoff); + arena->logoff = arena_off + le64_to_cpu(super->logoff); + arena->info2off = arena_off + le64_to_cpu(super->info2off); + + arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) : + (arena->info2off - arena->infooff + BTT_PG_SIZE); + + arena->flags = le32_to_cpu(super->flags); +} + +static int discover_arenas(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + struct btt_sb *super; + size_t remaining = btt->rawsize; + u64 cur_nlba = 0; + size_t cur_off = 0; + int num_arenas = 0; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return -ENOMEM; + + while (remaining) { + /* Alloc memory for arena */ + arena = alloc_arena(btt, 0, 0, 0); + if (!arena) { + ret = -ENOMEM; + goto out_super; + } + + arena->infooff = cur_off; + ret = btt_info_read(arena, super); + if (ret) + goto out; + + if (!arena_is_valid(arena, super, btt->nd_btt->uuid, + btt->lbasize)) { + if (remaining == btt->rawsize) { + btt->init_state = INIT_NOTFOUND; + dev_info(to_dev(arena), "No existing arenas\n"); + goto out; + } else { + dev_info(to_dev(arena), + "Found corrupted metadata!\n"); + ret = -ENODEV; + goto out; + } + } + + arena->external_lba_start = cur_nlba; + parse_arena_meta(arena, super, cur_off); + + ret = btt_freelist_init(arena); + if (ret) + goto out; + + ret = btt_rtt_init(arena); + if (ret) + goto out; + + ret = btt_maplocks_init(arena); + if (ret) + goto out; + + list_add_tail(&arena->list, &btt->arena_list); + + remaining -= arena->size; + cur_off += arena->size; + cur_nlba += arena->external_nlba; + num_arenas++; + + if (arena->nextoff == 0) + break; + } + btt->num_arenas = num_arenas; + btt->nlba = cur_nlba; + btt->init_state = INIT_READY; + + kfree(super); + return ret; + + out: + kfree(arena); + free_arenas(btt); + out_super: + kfree(super); + return ret; +} + +static int create_arenas(struct btt *btt) +{ + size_t remaining = btt->rawsize; + size_t cur_off = 0; + + while (remaining) { + struct arena_info *arena; + size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining); + + remaining -= arena_size; + if (arena_size < ARENA_MIN_SIZE) + break; + + arena = alloc_arena(btt, arena_size, btt->nlba, cur_off); + if (!arena) { + free_arenas(btt); + return -ENOMEM; + } + btt->nlba += arena->external_nlba; + if (remaining >= ARENA_MIN_SIZE) + arena->nextoff = arena->size; + else + arena->nextoff = 0; + cur_off += arena_size; + list_add_tail(&arena->list, &btt->arena_list); + } + + return 0; +} + +/* + * This function completes arena initialization by writing + * all the metadata. + * It is only called for an uninitialized arena when a write + * to that arena occurs for the first time. + */ +static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid) +{ + int ret; + struct btt_sb *super; + + ret = btt_map_init(arena); + if (ret) + return ret; + + ret = btt_log_init(arena); + if (ret) + return ret; + + super = kzalloc(sizeof(struct btt_sb), GFP_NOIO); + if (!super) + return -ENOMEM; + + strncpy(super->signature, BTT_SIG, BTT_SIG_LEN); + memcpy(super->uuid, uuid, 16); + super->flags = cpu_to_le32(arena->flags); + super->version_major = cpu_to_le16(arena->version_major); + super->version_minor = cpu_to_le16(arena->version_minor); + super->external_lbasize = cpu_to_le32(arena->external_lbasize); + super->external_nlba = cpu_to_le32(arena->external_nlba); + super->internal_lbasize = cpu_to_le32(arena->internal_lbasize); + super->internal_nlba = cpu_to_le32(arena->internal_nlba); + super->nfree = cpu_to_le32(arena->nfree); + super->infosize = cpu_to_le32(sizeof(struct btt_sb)); + super->nextoff = cpu_to_le64(arena->nextoff); + /* + * Subtract arena->infooff (arena start) so numbers are relative + * to 'this' arena + */ + super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff); + super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff); + super->logoff = cpu_to_le64(arena->logoff - arena->infooff); + super->info2off = cpu_to_le64(arena->info2off - arena->infooff); + + super->flags = 0; + super->checksum = cpu_to_le64(nd_btt_sb_checksum(super)); + + ret = btt_info_write(arena, super); + + kfree(super); + return ret; +} + +/* + * This function completes the initialization for the BTT namespace + * such that it is ready to accept IOs + */ +static int btt_meta_init(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + + mutex_lock(&btt->init_lock); + list_for_each_entry(arena, &btt->arena_list, list) { + ret = btt_arena_write_layout(arena, btt->nd_btt->uuid); + if (ret) + goto unlock; + + ret = btt_freelist_init(arena); + if (ret) + goto unlock; + + ret = btt_rtt_init(arena); + if (ret) + goto unlock; + + ret = btt_maplocks_init(arena); + if (ret) + goto unlock; + } + + btt->init_state = INIT_READY; + + unlock: + mutex_unlock(&btt->init_lock); + return ret; +} + +/* + * This function calculates the arena in which the given LBA lies + * by doing a linear walk. This is acceptable since we expect only + * a few arenas. If we have backing devices that get much larger, + * we can construct a balanced binary tree of arenas at init time + * so that this range search becomes faster. + */ +static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap, + struct arena_info **arena) +{ + struct arena_info *arena_list; + __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size); + + list_for_each_entry(arena_list, &btt->arena_list, list) { + if (lba < arena_list->external_nlba) { + *arena = arena_list; + *premap = lba; + return 0; + } + lba -= arena_list->external_nlba; + } + + return -EIO; +} + +/* + * The following (lock_map, unlock_map) are mostly just to improve + * readability, since they index into an array of locks + */ +static void lock_map(struct arena_info *arena, u32 premap) + __acquires(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_lock(&arena->map_locks[idx].lock); +} + +static void unlock_map(struct arena_info *arena, u32 premap) + __releases(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_unlock(&arena->map_locks[idx].lock); +} + +static u64 to_namespace_offset(struct arena_info *arena, u64 lba) +{ + return arena->dataoff + ((u64)lba * arena->internal_lbasize); +} + +static int btt_data_read(struct arena_info *arena, struct page *page, + unsigned int off, u32 lba, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_read_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static int btt_data_write(struct arena_info *arena, u32 lba, + struct page *page, unsigned int off, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_write_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static void zero_fill_data(struct page *page, unsigned int off, u32 len) +{ + void *mem = kmap_atomic(page); + + memset(mem + off, 0, len); + kunmap_atomic(mem); +} + +static int btt_read_pg(struct btt *btt, struct page *page, unsigned int off, + sector_t sector, unsigned int len) +{ + int ret = 0; + int t_flag, e_flag; + struct arena_info *arena = NULL; + u32 lane = 0, premap, postmap; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + + cur_len = min(btt->sector_size, len); + + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag); + if (ret) + goto out_lane; + + /* + * We loop to make sure that the post map LBA didn't change + * from under us between writing the RTT and doing the actual + * read. + */ + while (1) { + u32 new_map; + + if (t_flag) { + zero_fill_data(page, off, cur_len); + goto out_lane; + } + + if (e_flag) { + ret = -EIO; + goto out_lane; + } + + arena->rtt[lane] = RTT_VALID | postmap; + /* + * Barrier to make sure this write is not reordered + * to do the verification map_read before the RTT store + */ + barrier(); + + ret = btt_map_read(arena, premap, &new_map, &t_flag, + &e_flag); + if (ret) + goto out_rtt; + + if (postmap == new_map) + break; + + postmap = new_map; + } + + ret = btt_data_read(arena, page, off, postmap, cur_len); + if (ret) + goto out_rtt; + + arena->rtt[lane] = RTT_INVALID; + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_rtt: + arena->rtt[lane] = RTT_INVALID; + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_write_pg(struct btt *btt, sector_t sector, struct page *page, + unsigned int off, unsigned int len) +{ + int ret = 0; + struct arena_info *arena = NULL; + u32 premap = 0, old_postmap, new_postmap, lane = 0, i; + struct log_entry log; + int sub; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + cur_len = min(btt->sector_size, len); + + if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) { + ret = -EIO; + goto out_lane; + } + + new_postmap = arena->freelist[lane].block; + + /* Wait if the new block is being read from */ + for (i = 0; i < arena->nfree; i++) + while (arena->rtt[i] == (RTT_VALID | new_postmap)) + cpu_relax(); + + + if (new_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_lane; + } else + ret = btt_data_write(arena, new_postmap, page, + off, cur_len); + if (ret) + goto out_lane; + + lock_map(arena, premap); + ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL); + if (ret) + goto out_map; + if (old_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_map; + } + + log.lba = cpu_to_le32(premap); + log.old_map = cpu_to_le32(old_postmap); + log.new_map = cpu_to_le32(new_postmap); + log.seq = cpu_to_le32(arena->freelist[lane].seq); + sub = arena->freelist[lane].sub; + ret = btt_flog_write(arena, lane, sub, &log); + if (ret) + goto out_map; + + ret = btt_map_write(arena, premap, new_postmap, 0, 0); + if (ret) + goto out_map; + + unlock_map(arena, premap); + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_map: + unlock_map(arena, premap); + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_do_bvec(struct btt *btt, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + int ret; + + if (rw == READ) { + ret = btt_read_pg(btt, page, off, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + ret = btt_write_pg(btt, sector, page, off, len); + } + + return ret; +} + +static void btt_make_request(struct request_queue *q, struct bio *bio) +{ + struct btt *btt = q->queuedata; + struct bvec_iter iter; + struct bio_vec bvec; + int err = 0, rw; + + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + BUG_ON(len > PAGE_SIZE); + /* Make sure len is in multiples of sector size. */ + /* XXX is this right? */ + BUG_ON(len < btt->sector_size); + BUG_ON(len % btt->sector_size); + + err = btt_do_bvec(btt, bvec.bv_page, len, bvec.bv_offset, + rw, iter.bi_sector); + if (err) { + dev_info(&btt->nd_btt->dev, + "io error in %s sector %lld, len %d,\n", + (rw == READ) ? "READ" : "WRITE", + (unsigned long long) iter.bi_sector, len); + goto out; + } + } + +out: + bio_endio(bio, err); +} + +static int btt_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct btt *btt = bdev->bd_disk->private_data; + + btt_do_bvec(btt, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, 0); + return 0; +} + + +static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + +static const struct block_device_operations btt_fops = { + .owner = THIS_MODULE, + .rw_page = btt_rw_page, + .getgeo = btt_getgeo, +}; + +static int btt_blk_init(struct btt *btt) +{ + struct nd_btt *nd_btt = btt->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* create a new disk and request queue for btt */ + btt->btt_queue = blk_alloc_queue(GFP_KERNEL); + if (!btt->btt_queue) + return -ENOMEM; + + btt->btt_disk = alloc_disk(0); + if (!btt->btt_disk) { + blk_cleanup_queue(btt->btt_queue); + return -ENOMEM; + } + + nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); + btt->btt_disk->driverfs_dev = &btt->nd_btt->dev; + btt->btt_disk->major = btt_major; + btt->btt_disk->first_minor = 0; + btt->btt_disk->fops = &btt_fops; + btt->btt_disk->private_data = btt; + btt->btt_disk->queue = btt->btt_queue; + btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + + blk_queue_make_request(btt->btt_queue, btt_make_request); + blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); + blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); + blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue); + btt->btt_queue->queuedata = btt; + + set_capacity(btt->btt_disk, + btt->nlba * btt->sector_size >> SECTOR_SHIFT); + add_disk(btt->btt_disk); + + return 0; +} + +static void btt_blk_cleanup(struct btt *btt) +{ + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); + blk_cleanup_queue(btt->btt_queue); +} + +/** + * btt_init - initialize a block translation table for the given device + * @nd_btt: device with BTT geometry and backing device info + * @rawsize: raw size in bytes of the backing device + * @lbasize: lba size of the backing device + * @uuid: A uuid for the backing device - this is stored on media + * @maxlane: maximum number of parallel requests the device can handle + * + * Initialize a Block Translation Table on a backing device to provide + * single sector power fail atomicity. + * + * Context: + * Might sleep. + * + * Returns: + * Pointer to a new struct btt on success, NULL on failure. + */ +static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, + u32 lbasize, u8 *uuid, struct nd_region *nd_region) +{ + int ret; + struct btt *btt; + struct device *dev = &nd_btt->dev; + + btt = kzalloc(sizeof(struct btt), GFP_KERNEL); + if (!btt) + return NULL; + + btt->nd_btt = nd_btt; + btt->rawsize = rawsize; + btt->lbasize = lbasize; + btt->sector_size = ((lbasize >= 4096) ? 4096 : 512); + INIT_LIST_HEAD(&btt->arena_list); + mutex_init(&btt->init_lock); + btt->nd_region = nd_region; + + ret = discover_arenas(btt); + if (ret) { + dev_err(dev, "init: error in arena_discover: %d\n", ret); + goto out_free; + } + + if (btt->init_state != INIT_READY) { + btt->num_arenas = (rawsize / ARENA_MAX_SIZE) + + ((rawsize % ARENA_MAX_SIZE) ? 1 : 0); + dev_dbg(dev, "init: %d arenas for %llu rawsize\n", + btt->num_arenas, rawsize); + + ret = create_arenas(btt); + if (ret) { + dev_info(dev, "init: create_arenas: %d\n", ret); + goto out_free; + } + + ret = btt_meta_init(btt); + if (ret) { + dev_err(dev, "init: error in meta_init: %d\n", ret); + return NULL; + } + } + + ret = btt_blk_init(btt); + if (ret) { + dev_err(dev, "init: error in blk_init: %d\n", ret); + goto out_free; + } + + btt_debugfs_init(btt); + + return btt; + + out_free: + kfree(btt); + return NULL; +} + +/** + * btt_fini - de-initialize a BTT + * @btt: the BTT handle that was generated by btt_init + * + * De-initialize a Block Translation Table on device removal + * + * Context: + * Might sleep. + */ +static void btt_fini(struct btt *btt) +{ + if (btt) { + btt_blk_cleanup(btt); + free_arenas(btt); + debugfs_remove_recursive(btt->debugfs_dir); + kfree(btt); + } +} + +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct nd_region *nd_region; + struct btt *btt; + size_t rawsize; + + if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) + return -ENODEV; + + rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K; + if (rawsize < ARENA_MIN_SIZE) { + return -ENXIO; + } + nd_region = to_nd_region(nd_btt->dev.parent); + btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid, + nd_region); + if (!btt) + return -ENOMEM; + nd_btt->btt = btt; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_attach_btt); + +int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct btt *btt = nd_btt->btt; + + btt_fini(btt); + nd_btt->btt = NULL; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_detach_btt); + +static int __init nd_btt_init(void) +{ + int rc; + + BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K); + + btt_major = register_blkdev(0, "btt"); + if (btt_major < 0) + return btt_major; + + debugfs_root = debugfs_create_dir("btt", NULL); + if (IS_ERR_OR_NULL(debugfs_root)) { + rc = -ENXIO; + goto err_debugfs; + } + + return 0; + + err_debugfs: + unregister_blkdev(btt_major, "btt"); + + return rc; +} + +static void __exit nd_btt_exit(void) +{ + debugfs_remove_recursive(debugfs_root); + unregister_blkdev(btt_major, "btt"); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT); +MODULE_AUTHOR("Vishal Verma "); +MODULE_LICENSE("GPL v2"); +module_init(nd_btt_init); +module_exit(nd_btt_exit); diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index e8f6d8e0ddd3..8c95a7792c3e 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -19,6 +19,39 @@ #define BTT_SIG_LEN 16 #define BTT_SIG "BTT_ARENA_INFO\0" +#define MAP_ENT_SIZE 4 +#define MAP_TRIM_SHIFT 31 +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT) +#define MAP_ERR_SHIFT 30 +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) +#define MAP_ENT_NORMAL 0xC0000000 +#define LOG_ENT_SIZE sizeof(struct log_entry) +#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ +#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ +#define RTT_VALID (1UL << 31) +#define RTT_INVALID 0 +#define INT_LBASIZE_ALIGNMENT 256 +#define BTT_PG_SIZE 4096 +#define BTT_DEFAULT_NFREE ND_MAX_LANES +#define LOG_SEQ_INIT 1 + +#define IB_FLAG_ERROR 0x00000001 +#define IB_FLAG_ERROR_MASK 0x00000001 + +enum btt_init_state { + INIT_UNCHECKED = 0, + INIT_NOTFOUND, + INIT_READY +}; + +struct log_entry { + __le32 lba; + __le32 old_map; + __le32 new_map; + __le32 seq; + __le64 padding[2]; +}; struct btt_sb { u8 signature[BTT_SIG_LEN]; @@ -42,4 +75,112 @@ struct btt_sb { __le64 checksum; }; +struct free_entry { + u32 block; + u8 sub; + u8 seq; +}; + +struct aligned_lock { + union { + spinlock_t lock; + u8 cacheline_padding[L1_CACHE_BYTES]; + }; +}; + +/** + * struct arena_info - handle for an arena + * @size: Size in bytes this arena occupies on the raw device. + * This includes arena metadata. + * @external_lba_start: The first external LBA in this arena. + * @internal_nlba: Number of internal blocks available in the arena + * including nfree reserved blocks + * @internal_lbasize: Internal and external lba sizes may be different as + * we can round up 'odd' external lbasizes such as 520B + * to be aligned. + * @external_nlba: Number of blocks contributed by the arena to the number + * reported to upper layers. (internal_nlba - nfree) + * @external_lbasize: LBA size as exposed to upper layers. + * @nfree: A reserve number of 'free' blocks that is used to + * handle incoming writes. + * @version_major: Metadata layout version major. + * @version_minor: Metadata layout version minor. + * @nextoff: Offset in bytes to the start of the next arena. + * @infooff: Offset in bytes to the info block of this arena. + * @dataoff: Offset in bytes to the data area of this arena. + * @mapoff: Offset in bytes to the map area of this arena. + * @logoff: Offset in bytes to the log area of this arena. + * @info2off: Offset in bytes to the backup info block of this arena. + * @freelist: Pointer to in-memory list of free blocks + * @rtt: Pointer to in-memory "Read Tracking Table" + * @map_locks: Spinlocks protecting concurrent map writes + * @nd_btt: Pointer to parent nd_btt structure. + * @list: List head for list of arenas + * @debugfs_dir: Debugfs dentry + * @flags: Arena flags - may signify error states. + * + * arena_info is a per-arena handle. Once an arena is narrowed down for an + * IO, this struct is passed around for the duration of the IO. + */ +struct arena_info { + u64 size; /* Total bytes for this arena */ + u64 external_lba_start; + u32 internal_nlba; + u32 internal_lbasize; + u32 external_nlba; + u32 external_lbasize; + u32 nfree; + u16 version_major; + u16 version_minor; + /* Byte offsets to the different on-media structures */ + u64 nextoff; + u64 infooff; + u64 dataoff; + u64 mapoff; + u64 logoff; + u64 info2off; + /* Pointers to other in-memory structures for this arena */ + struct free_entry *freelist; + u32 *rtt; + struct aligned_lock *map_locks; + struct nd_btt *nd_btt; + struct list_head list; + struct dentry *debugfs_dir; + /* Arena flags */ + u32 flags; +}; + +/** + * struct btt - handle for a BTT instance + * @btt_disk: Pointer to the gendisk for BTT device + * @btt_queue: Pointer to the request queue for the BTT device + * @arena_list: Head of the list of arenas + * @debugfs_dir: Debugfs dentry + * @nd_btt: Parent nd_btt struct + * @nlba: Number of logical blocks exposed to the upper layers + * after removing the amount of space needed by metadata + * @rawsize: Total size in bytes of the available backing device + * @lbasize: LBA size as requested and presented to upper layers. + * This is sector_size + size of any metadata. + * @sector_size: The Linux sector size - 512 or 4096 + * @lanes: Per-lane spinlocks + * @init_lock: Mutex used for the BTT initialization + * @init_state: Flag describing the initialization state for the BTT + * @num_arenas: Number of arenas in the BTT instance + */ +struct btt { + struct gendisk *btt_disk; + struct request_queue *btt_queue; + struct list_head arena_list; + struct dentry *debugfs_dir; + struct nd_btt *nd_btt; + u64 nlba; + unsigned long long rawsize; + u32 lbasize; + u32 sector_size; + struct nd_region *nd_region; + struct mutex init_lock; + int init_state; + int num_arenas; +}; #endif diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index effb70a88347..470fbdccd0ac 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -348,7 +348,8 @@ struct device *nd_btt_create(struct nd_region *nd_region) */ u64 nd_btt_sb_checksum(struct btt_sb *btt_sb) { - u64 sum, sum_save; + u64 sum; + __le64 sum_save; sum_save = btt_sb->checksum; btt_sb->checksum = 0; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 2c50a0719f8d..4aa647c8d644 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -76,6 +76,30 @@ static bool is_namespace_io(struct device *dev) return dev ? dev->type == &namespace_io_device_type : false; } +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name) +{ + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + const char *suffix = ""; + + if (ndns->claim && is_nd_btt(ndns->claim)) + suffix = "s"; + + if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) + sprintf(name, "pmem%d%s", nd_region->id, suffix); + else if (is_namespace_blk(&ndns->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix); + } else { + return NULL; + } + + return name; +} +EXPORT_SYMBOL(nvdimm_namespace_disk_name); + static ssize_t nstype_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index d13eccbb67e9..1b937c235913 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -20,6 +20,12 @@ #include "label.h" enum { + /* + * Limits the maximum number of block apertures a dimm can + * support and is an input to the geometry/on-disk-format of a + * BTT instance + */ + ND_MAX_LANES = 256, SECTOR_SHIFT = 9, }; @@ -75,6 +81,11 @@ static inline struct nd_namespace_index *to_next_namespace_index( for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \ res; res = next, next = next ? next->sibling : NULL) +struct nd_percpu_lane { + int count; + spinlock_t lock; +}; + struct nd_region { struct device dev; struct ida ns_ida; @@ -84,9 +95,10 @@ struct nd_region { u16 ndr_mappings; u64 ndr_size; u64 ndr_start; - int id; + int id, num_lanes; void *provider_data; struct nd_interleave_set *nd_set; + struct nd_percpu_lane __percpu *lane; struct nd_mapping mapping[0]; }; @@ -100,9 +112,11 @@ static inline unsigned nd_inc_seq(unsigned seq) return next[seq & 3]; } +struct btt; struct nd_btt { struct device dev; struct nd_namespace_common *ndns; + struct btt *btt; unsigned long lbasize; u8 *uuid; int id; @@ -157,6 +171,8 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) #endif struct nd_region *to_nd_region(struct device *dev); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); @@ -172,4 +188,8 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, resource_size_t n); resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns); struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev); +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); +int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name); #endif /* __ND_H__ */ diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d0c6b4bdba69..7346054bccbb 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -160,7 +160,6 @@ static void pmem_detach_disk(struct pmem_device *pmem) static int pmem_attach_disk(struct nd_namespace_common *ndns, struct pmem_device *pmem) { - struct nd_region *nd_region = to_nd_region(ndns->dev.parent); struct gendisk *disk; pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); @@ -183,7 +182,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns, disk->private_data = pmem; disk->queue = pmem->pmem_queue; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "pmem%d", nd_region->id); + nvdimm_namespace_disk_name(ndns, disk->disk_name); disk->driverfs_dev = &ndns->dev; set_capacity(disk, pmem->size >> 9); pmem->pmem_disk = disk; @@ -211,17 +210,6 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, return 0; } -static int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) -{ - /* TODO */ - return -ENXIO; -} - -static void nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns) -{ - /* TODO */ -} - static void pmem_free(struct pmem_device *pmem) { iounmap(pmem->virt_addr); diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 2a5f3f53d79d..eb8aebcd4800 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -10,6 +10,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include #include #include #include @@ -18,10 +19,21 @@ static int nd_region_probe(struct device *dev) { int err; + static unsigned long once; struct nd_region_namespaces *num_ns; struct nd_region *nd_region = to_nd_region(dev); int rc = nd_region_register_namespaces(nd_region, &err); + if (nd_region->num_lanes > num_online_cpus() + && nd_region->num_lanes < num_possible_cpus() + && !test_and_set_bit(0, &once)) { + dev_info(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n", + num_online_cpus(), nd_region->num_lanes, + num_possible_cpus()); + dev_info(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n", + nd_region->num_lanes); + } + num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL); if (!num_ns) return -ENOMEM; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4fd92389fa7e..fe8ec21fe3d7 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -32,6 +32,7 @@ static void nd_region_release(struct device *dev) put_device(&nvdimm->dev); } + free_percpu(nd_region->lane); ida_simple_remove(®ion_ida, nd_region->id); kfree(nd_region); } @@ -531,13 +532,66 @@ void *nd_region_provider_data(struct nd_region *nd_region) } EXPORT_SYMBOL_GPL(nd_region_provider_data); +/** + * nd_region_acquire_lane - allocate and lock a lane + * @nd_region: region id and number of lanes possible + * + * A lane correlates to a BLK-data-window and/or a log slot in the BTT. + * We optimize for the common case where there are 256 lanes, one + * per-cpu. For larger systems we need to lock to share lanes. For now + * this implementation assumes the cost of maintaining an allocator for + * free lanes is on the order of the lock hold time, so it implements a + * static lane = cpu % num_lanes mapping. + * + * In the case of a BTT instance on top of a BLK namespace a lane may be + * acquired recursively. We lock on the first instance. + * + * In the case of a BTT instance on top of PMEM, we only acquire a lane + * for the BTT metadata updates. + */ +unsigned int nd_region_acquire_lane(struct nd_region *nd_region) +{ + unsigned int cpu, lane; + + cpu = get_cpu(); + if (nd_region->num_lanes < nr_cpu_ids) { + struct nd_percpu_lane *ndl_lock, *ndl_count; + + lane = cpu % nd_region->num_lanes; + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (ndl_count->count++ == 0) + spin_lock(&ndl_lock->lock); + } else + lane = cpu; + + return lane; +} +EXPORT_SYMBOL(nd_region_acquire_lane); + +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane) +{ + if (nd_region->num_lanes < nr_cpu_ids) { + unsigned int cpu = get_cpu(); + struct nd_percpu_lane *ndl_lock, *ndl_count; + + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (--ndl_count->count == 0) + spin_unlock(&ndl_lock->lock); + put_cpu(); + } + put_cpu(); +} +EXPORT_SYMBOL(nd_region_release_lane); + static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc, struct device_type *dev_type, const char *caller) { struct nd_region *nd_region; struct device *dev; - u16 i; + unsigned int i; for (i = 0; i < ndr_desc->num_mappings; i++) { struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; @@ -557,9 +611,19 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (!nd_region) return NULL; nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); - if (nd_region->id < 0) { - kfree(nd_region); - return NULL; + if (nd_region->id < 0) + goto err_id; + + nd_region->lane = alloc_percpu(struct nd_percpu_lane); + if (!nd_region->lane) + goto err_percpu; + + for (i = 0; i < nr_cpu_ids; i++) { + struct nd_percpu_lane *ndl; + + ndl = per_cpu_ptr(nd_region->lane, i); + spin_lock_init(&ndl->lock); + ndl->count = 0; } memcpy(nd_region->mapping, ndr_desc->nd_mapping, @@ -573,6 +637,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->ndr_mappings = ndr_desc->num_mappings; nd_region->provider_data = ndr_desc->provider_data; nd_region->nd_set = ndr_desc->nd_set; + nd_region->num_lanes = ndr_desc->num_lanes; ida_init(&nd_region->ns_ida); ida_init(&nd_region->btt_ida); dev = &nd_region->dev; @@ -585,11 +650,18 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_device_register(dev); return nd_region; + + err_percpu: + ida_simple_remove(®ion_ida, nd_region->id); + err_id: + kfree(nd_region); + return NULL; } struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc) { + ndr_desc->num_lanes = ND_MAX_LANES; return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type, __func__); } @@ -600,6 +672,7 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, { if (ndr_desc->num_mappings > 1) return NULL; + ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES); return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type, __func__); } @@ -608,6 +681,7 @@ EXPORT_SYMBOL_GPL(nvdimm_blk_region_create); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc) { + ndr_desc->num_lanes = ND_MAX_LANES; return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type, __func__); } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a59dca17b3aa..531d99dfac68 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -85,6 +85,7 @@ struct nd_region_desc { const struct attribute_group **attr_groups; struct nd_interleave_set *nd_set; void *provider_data; + int num_lanes; }; struct nvdimm_bus; -- cgit v1.2.3-59-g8ed1b From 047fc8a1f9a6330eacc80374dff087e20dc2304b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 04:21:02 -0400 Subject: libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory The libnvdimm implementation handles allocating dimm address space (DPA) between PMEM and BLK mode interfaces. After DPA has been allocated from a BLK-region to a BLK-namespace the nd_blk driver attaches to handle I/O as a struct bio based block device. Unlike PMEM, BLK is required to handle platform specific details like mmio register formats and memory controller interleave. For this reason the libnvdimm generic nd_blk driver calls back into the bus provider to carry out the I/O. This initial implementation handles the BLK interface defined by the ACPI 6 NFIT [1] and the NVDIMM DSM Interface Example [2] composed from DCR (dimm control region), BDW (block data window), IDT (interleave descriptor) NFIT structures and the hardware register format. [1]: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf [2]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Signed-off-by: Ross Zwisler Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 449 ++++++++++++++++++++++++++++++++++++++-- drivers/acpi/nfit.h | 49 +++++ drivers/nvdimm/Kconfig | 13 ++ drivers/nvdimm/Makefile | 3 + drivers/nvdimm/blk.c | 245 ++++++++++++++++++++++ drivers/nvdimm/dimm_devs.c | 9 + drivers/nvdimm/namespace_devs.c | 65 +++++- drivers/nvdimm/nd-core.h | 3 +- drivers/nvdimm/nd.h | 13 +- drivers/nvdimm/region.c | 8 +- drivers/nvdimm/region_devs.c | 91 +++++++- include/linux/libnvdimm.h | 27 ++- 12 files changed, 941 insertions(+), 34 deletions(-) create mode 100644 drivers/nvdimm/blk.c (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index fc38b49eff7d..2aa6aa97b40c 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -13,12 +13,20 @@ #include #include #include +#include #include #include #include #include +#include #include "nfit.h" +/* + * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is + * irrelevant. + */ +#include + static bool force_enable_dimms; module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status"); @@ -72,7 +80,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, if (!adev) return -ENOTTY; - dimm_name = dev_name(&adev->dev); + dimm_name = nvdimm_name(nvdimm); cmd_name = nvdimm_cmd_name(cmd); dsm_mask = nfit_mem->dsm_mask; desc = nd_cmd_dimm_desc(cmd); @@ -279,6 +287,23 @@ static bool add_bdw(struct acpi_nfit_desc *acpi_desc, return true; } +static bool add_idt(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_interleave *idt) +{ + struct device *dev = acpi_desc->dev; + struct nfit_idt *nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt), + GFP_KERNEL); + + if (!nfit_idt) + return false; + INIT_LIST_HEAD(&nfit_idt->list); + nfit_idt->idt = idt; + list_add_tail(&nfit_idt->list, &acpi_desc->idts); + dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__, + idt->interleave_index, idt->line_count); + return true; +} + static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table, const void *end) { @@ -307,9 +332,9 @@ static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table, if (!add_bdw(acpi_desc, table)) return err; break; - /* TODO */ case ACPI_NFIT_TYPE_INTERLEAVE: - dev_dbg(dev, "%s: idt\n", __func__); + if (!add_idt(acpi_desc, table)) + return err; break; case ACPI_NFIT_TYPE_FLUSH_ADDRESS: dev_dbg(dev, "%s: flush\n", __func__); @@ -362,8 +387,11 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa) { u16 dcr = __to_nfit_memdev(nfit_mem)->region_index; + struct nfit_memdev *nfit_memdev; struct nfit_dcr *nfit_dcr; struct nfit_bdw *nfit_bdw; + struct nfit_idt *nfit_idt; + u16 idt_idx, range_index; list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { if (nfit_dcr->dcr->region_index != dcr) @@ -396,6 +424,26 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc, return 0; nfit_mem_find_spa_bdw(acpi_desc, nfit_mem); + + if (!nfit_mem->spa_bdw) + return 0; + + range_index = nfit_mem->spa_bdw->range_index; + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + if (nfit_memdev->memdev->range_index != range_index || + nfit_memdev->memdev->region_index != dcr) + continue; + nfit_mem->memdev_bdw = nfit_memdev->memdev; + idt_idx = nfit_memdev->memdev->interleave_index; + list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { + if (nfit_idt->idt->interleave_index != idt_idx) + continue; + nfit_mem->idt_bdw = nfit_idt->idt; + break; + } + break; + } + return 0; } @@ -439,9 +487,19 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc, } if (type == NFIT_SPA_DCR) { + struct nfit_idt *nfit_idt; + u16 idt_idx; + /* multiple dimms may share a SPA when interleaved */ nfit_mem->spa_dcr = spa; nfit_mem->memdev_dcr = nfit_memdev->memdev; + idt_idx = nfit_memdev->memdev->interleave_index; + list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { + if (nfit_idt->idt->interleave_index != idt_idx) + continue; + nfit_mem->idt_dcr = nfit_idt->idt; + break; + } } else { /* * A single dimm may belong to multiple SPA-PM @@ -871,6 +929,359 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, return 0; } +static u64 to_interleave_offset(u64 offset, struct nfit_blk_mmio *mmio) +{ + struct acpi_nfit_interleave *idt = mmio->idt; + u32 sub_line_offset, line_index, line_offset; + u64 line_no, table_skip_count, table_offset; + + line_no = div_u64_rem(offset, mmio->line_size, &sub_line_offset); + table_skip_count = div_u64_rem(line_no, mmio->num_lines, &line_index); + line_offset = idt->line_offset[line_index] + * mmio->line_size; + table_offset = table_skip_count * mmio->table_size; + + return mmio->base_offset + line_offset + table_offset + sub_line_offset; +} + +static u64 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw) +{ + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR]; + u64 offset = nfit_blk->stat_offset + mmio->size * bw; + + if (mmio->num_lines) + offset = to_interleave_offset(offset, mmio); + + return readq(mmio->base + offset); +} + +static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, + resource_size_t dpa, unsigned int len, unsigned int write) +{ + u64 cmd, offset; + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR]; + + enum { + BCW_OFFSET_MASK = (1ULL << 48)-1, + BCW_LEN_SHIFT = 48, + BCW_LEN_MASK = (1ULL << 8) - 1, + BCW_CMD_SHIFT = 56, + }; + + cmd = (dpa >> L1_CACHE_SHIFT) & BCW_OFFSET_MASK; + len = len >> L1_CACHE_SHIFT; + cmd |= ((u64) len & BCW_LEN_MASK) << BCW_LEN_SHIFT; + cmd |= ((u64) write) << BCW_CMD_SHIFT; + + offset = nfit_blk->cmd_offset + mmio->size * bw; + if (mmio->num_lines) + offset = to_interleave_offset(offset, mmio); + + writeq(cmd, mmio->base + offset); + /* FIXME: conditionally perform read-back if mandated by firmware */ +} + +static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, + resource_size_t dpa, void *iobuf, size_t len, int rw, + unsigned int lane) +{ + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + unsigned int copied = 0; + u64 base_offset; + int rc; + + base_offset = nfit_blk->bdw_offset + dpa % L1_CACHE_BYTES + + lane * mmio->size; + /* TODO: non-temporal access, flush hints, cache management etc... */ + write_blk_ctl(nfit_blk, lane, dpa, len, rw); + while (len) { + unsigned int c; + u64 offset; + + if (mmio->num_lines) { + u32 line_offset; + + offset = to_interleave_offset(base_offset + copied, + mmio); + div_u64_rem(offset, mmio->line_size, &line_offset); + c = min_t(size_t, len, mmio->line_size - line_offset); + } else { + offset = base_offset + nfit_blk->bdw_offset; + c = len; + } + + if (rw) + memcpy(mmio->aperture + offset, iobuf + copied, c); + else + memcpy(iobuf + copied, mmio->aperture + offset, c); + + copied += c; + len -= c; + } + rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0; + return rc; +} + +static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr, + resource_size_t dpa, void *iobuf, u64 len, int rw) +{ + struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr); + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + struct nd_region *nd_region = nfit_blk->nd_region; + unsigned int lane, copied = 0; + int rc = 0; + + lane = nd_region_acquire_lane(nd_region); + while (len) { + u64 c = min(len, mmio->size); + + rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied, + iobuf + copied, c, rw, lane); + if (rc) + break; + + copied += c; + len -= c; + } + nd_region_release_lane(nd_region, lane); + + return rc; +} + +static void nfit_spa_mapping_release(struct kref *kref) +{ + struct nfit_spa_mapping *spa_map = to_spa_map(kref); + struct acpi_nfit_system_address *spa = spa_map->spa; + struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index); + iounmap(spa_map->iomem); + release_mem_region(spa->address, spa->length); + list_del(&spa_map->list); + kfree(spa_map); +} + +static struct nfit_spa_mapping *find_spa_mapping( + struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_spa_mapping *spa_map; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + list_for_each_entry(spa_map, &acpi_desc->spa_maps, list) + if (spa_map->spa == spa) + return spa_map; + + return NULL; +} + +static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_spa_mapping *spa_map; + + mutex_lock(&acpi_desc->spa_map_mutex); + spa_map = find_spa_mapping(acpi_desc, spa); + + if (spa_map) + kref_put(&spa_map->kref, nfit_spa_mapping_release); + mutex_unlock(&acpi_desc->spa_map_mutex); +} + +static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + resource_size_t start = spa->address; + resource_size_t n = spa->length; + struct nfit_spa_mapping *spa_map; + struct resource *res; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + + spa_map = find_spa_mapping(acpi_desc, spa); + if (spa_map) { + kref_get(&spa_map->kref); + return spa_map->iomem; + } + + spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL); + if (!spa_map) + return NULL; + + INIT_LIST_HEAD(&spa_map->list); + spa_map->spa = spa; + kref_init(&spa_map->kref); + spa_map->acpi_desc = acpi_desc; + + res = request_mem_region(start, n, dev_name(acpi_desc->dev)); + if (!res) + goto err_mem; + + /* TODO: cacheability based on the spa type */ + spa_map->iomem = ioremap_nocache(start, n); + if (!spa_map->iomem) + goto err_map; + + list_add_tail(&spa_map->list, &acpi_desc->spa_maps); + return spa_map->iomem; + + err_map: + release_mem_region(start, n); + err_mem: + kfree(spa_map); + return NULL; +} + +/** + * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges + * @nvdimm_bus: NFIT-bus that provided the spa table entry + * @nfit_spa: spa table to map + * + * In the case where block-data-window apertures and + * dimm-control-regions are interleaved they will end up sharing a + * single request_mem_region() + ioremap() for the address range. In + * the style of devm nfit_spa_map() mappings are automatically dropped + * when all region devices referencing the same mapping are disabled / + * unbound. + */ +static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + void __iomem *iomem; + + mutex_lock(&acpi_desc->spa_map_mutex); + iomem = __nfit_spa_map(acpi_desc, spa); + mutex_unlock(&acpi_desc->spa_map_mutex); + + return iomem; +} + +static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio, + struct acpi_nfit_interleave *idt, u16 interleave_ways) +{ + if (idt) { + mmio->num_lines = idt->line_count; + mmio->line_size = idt->line_size; + if (interleave_ways == 0) + return -ENXIO; + mmio->table_size = mmio->num_lines * interleave_ways + * mmio->line_size; + } + + return 0; +} + +static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, + struct device *dev) +{ + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nd_blk_region *ndbr = to_nd_blk_region(dev); + struct nfit_blk_mmio *mmio; + struct nfit_blk *nfit_blk; + struct nfit_mem *nfit_mem; + struct nvdimm *nvdimm; + int rc; + + nvdimm = nd_blk_region_to_dimm(ndbr); + nfit_mem = nvdimm_provider_data(nvdimm); + if (!nfit_mem || !nfit_mem->dcr || !nfit_mem->bdw) { + dev_dbg(dev, "%s: missing%s%s%s\n", __func__, + nfit_mem ? "" : " nfit_mem", + nfit_mem->dcr ? "" : " dcr", + nfit_mem->bdw ? "" : " bdw"); + return -ENXIO; + } + + nfit_blk = devm_kzalloc(dev, sizeof(*nfit_blk), GFP_KERNEL); + if (!nfit_blk) + return -ENOMEM; + nd_blk_region_set_provider_data(ndbr, nfit_blk); + nfit_blk->nd_region = to_nd_region(dev); + + /* map block aperture memory */ + nfit_blk->bdw_offset = nfit_mem->bdw->offset; + mmio = &nfit_blk->mmio[BDW]; + mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw); + if (!mmio->base) { + dev_dbg(dev, "%s: %s failed to map bdw\n", __func__, + nvdimm_name(nvdimm)); + return -ENOMEM; + } + mmio->size = nfit_mem->bdw->size; + mmio->base_offset = nfit_mem->memdev_bdw->region_offset; + mmio->idt = nfit_mem->idt_bdw; + mmio->spa = nfit_mem->spa_bdw; + rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_bdw, + nfit_mem->memdev_bdw->interleave_ways); + if (rc) { + dev_dbg(dev, "%s: %s failed to init bdw interleave\n", + __func__, nvdimm_name(nvdimm)); + return rc; + } + + /* map block control memory */ + nfit_blk->cmd_offset = nfit_mem->dcr->command_offset; + nfit_blk->stat_offset = nfit_mem->dcr->status_offset; + mmio = &nfit_blk->mmio[DCR]; + mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr); + if (!mmio->base) { + dev_dbg(dev, "%s: %s failed to map dcr\n", __func__, + nvdimm_name(nvdimm)); + return -ENOMEM; + } + mmio->size = nfit_mem->dcr->window_size; + mmio->base_offset = nfit_mem->memdev_dcr->region_offset; + mmio->idt = nfit_mem->idt_dcr; + mmio->spa = nfit_mem->spa_dcr; + rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_dcr, + nfit_mem->memdev_dcr->interleave_ways); + if (rc) { + dev_dbg(dev, "%s: %s failed to init dcr interleave\n", + __func__, nvdimm_name(nvdimm)); + return rc; + } + + if (mmio->line_size == 0) + return 0; + + if ((u32) nfit_blk->cmd_offset % mmio->line_size + + 8 > mmio->line_size) { + dev_dbg(dev, "cmd_offset crosses interleave boundary\n"); + return -ENXIO; + } else if ((u32) nfit_blk->stat_offset % mmio->line_size + + 8 > mmio->line_size) { + dev_dbg(dev, "stat_offset crosses interleave boundary\n"); + return -ENXIO; + } + + return 0; +} + +static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, + struct device *dev) +{ + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nd_blk_region *ndbr = to_nd_blk_region(dev); + struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr); + int i; + + if (!nfit_blk) + return; /* never enabled */ + + /* auto-free BLK spa mappings */ + for (i = 0; i < 2; i++) { + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i]; + + if (mmio->base) + nfit_spa_unmap(acpi_desc, mmio->spa); + } + nd_blk_region_set_provider_data(ndbr, NULL); + /* devm will free nfit_blk */ +} + static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, @@ -878,6 +1289,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, { struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, memdev->device_handle); + struct nd_blk_region_desc *ndbr_desc; struct nfit_mem *nfit_mem; int blk_valid = 0; @@ -908,6 +1320,10 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, ndr_desc->nd_mapping = nd_mapping; ndr_desc->num_mappings = blk_valid; + ndbr_desc = to_blk_region_desc(ndr_desc); + ndbr_desc->enable = acpi_nfit_blk_region_enable; + ndbr_desc->disable = acpi_nfit_blk_region_disable; + ndbr_desc->do_io = acpi_nfit_blk_region_do_io; if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc)) return -ENOMEM; break; @@ -921,8 +1337,9 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, { static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; + struct nd_blk_region_desc ndbr_desc; + struct nd_region_desc *ndr_desc; struct nfit_memdev *nfit_memdev; - struct nd_region_desc ndr_desc; struct nvdimm_bus *nvdimm_bus; struct resource res; int count = 0, rc; @@ -935,12 +1352,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, memset(&res, 0, sizeof(res)); memset(&nd_mappings, 0, sizeof(nd_mappings)); - memset(&ndr_desc, 0, sizeof(ndr_desc)); + memset(&ndbr_desc, 0, sizeof(ndbr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; - ndr_desc.res = &res; - ndr_desc.provider_data = nfit_spa; - ndr_desc.attr_groups = acpi_nfit_region_attribute_groups; + ndr_desc = &ndbr_desc.ndr_desc; + ndr_desc->res = &res; + ndr_desc->provider_data = nfit_spa; + ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping *nd_mapping; @@ -953,24 +1371,24 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, return -ENXIO; } nd_mapping = &nd_mappings[count++]; - rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, &ndr_desc, + rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc, memdev, spa); if (rc) return rc; } - ndr_desc.nd_mapping = nd_mappings; - ndr_desc.num_mappings = count; - rc = acpi_nfit_init_interleave_set(acpi_desc, &ndr_desc, spa); + ndr_desc->nd_mapping = nd_mappings; + ndr_desc->num_mappings = count; + rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); if (rc) return rc; nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { - if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { - if (!nvdimm_volatile_region_create(nvdimm_bus, &ndr_desc)) + if (!nvdimm_volatile_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } return 0; @@ -996,11 +1414,14 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) u8 *data; int rc; + INIT_LIST_HEAD(&acpi_desc->spa_maps); INIT_LIST_HEAD(&acpi_desc->spas); INIT_LIST_HEAD(&acpi_desc->dcrs); INIT_LIST_HEAD(&acpi_desc->bdws); + INIT_LIST_HEAD(&acpi_desc->idts); INIT_LIST_HEAD(&acpi_desc->memdevs); INIT_LIST_HEAD(&acpi_desc->dimms); + mutex_init(&acpi_desc->spa_map_mutex); data = (u8 *) acpi_desc->nfit; end = data + sz; diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index b76e33629098..7bd38b7baf39 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -52,6 +52,11 @@ struct nfit_bdw { struct list_head list; }; +struct nfit_idt { + struct acpi_nfit_interleave *idt; + struct list_head list; +}; + struct nfit_memdev { struct acpi_nfit_memory_map *memdev; struct list_head list; @@ -62,10 +67,13 @@ struct nfit_mem { struct nvdimm *nvdimm; struct acpi_nfit_memory_map *memdev_dcr; struct acpi_nfit_memory_map *memdev_pmem; + struct acpi_nfit_memory_map *memdev_bdw; struct acpi_nfit_control_region *dcr; struct acpi_nfit_data_region *bdw; struct acpi_nfit_system_address *spa_dcr; struct acpi_nfit_system_address *spa_bdw; + struct acpi_nfit_interleave *idt_dcr; + struct acpi_nfit_interleave *idt_bdw; struct list_head list; struct acpi_device *adev; unsigned long dsm_mask; @@ -74,16 +82,57 @@ struct nfit_mem { struct acpi_nfit_desc { struct nvdimm_bus_descriptor nd_desc; struct acpi_table_nfit *nfit; + struct mutex spa_map_mutex; + struct list_head spa_maps; struct list_head memdevs; struct list_head dimms; struct list_head spas; struct list_head dcrs; struct list_head bdws; + struct list_head idts; struct nvdimm_bus *nvdimm_bus; struct device *dev; unsigned long dimm_dsm_force_en; }; +enum nd_blk_mmio_selector { + BDW, + DCR, +}; + +struct nfit_blk { + struct nfit_blk_mmio { + union { + void __iomem *base; + void *aperture; + }; + u64 size; + u64 base_offset; + u32 line_size; + u32 num_lines; + u32 table_size; + struct acpi_nfit_interleave *idt; + struct acpi_nfit_system_address *spa; + } mmio[2]; + struct nd_region *nd_region; + u64 bdw_offset; /* post interleave offset */ + u64 stat_offset; + u64 cmd_offset; +}; + +struct nfit_spa_mapping { + struct acpi_nfit_desc *acpi_desc; + struct acpi_nfit_system_address *spa; + struct list_head list; + struct kref kref; + void __iomem *iomem; +}; + +static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref) +{ + return container_of(kref, struct nfit_spa_mapping, kref); +} + static inline struct acpi_nfit_memory_map *__to_nfit_memdev( struct nfit_mem *nfit_mem) { diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 204ee0796411..72226acb5c0f 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -34,6 +34,19 @@ config BLK_DEV_PMEM Say Y if you want to use an NVDIMM +config ND_BLK + tristate "BLK: Block data window (aperture) device support" + default LIBNVDIMM + select ND_BTT if BTT + help + Support NVDIMMs, or other devices, that implement a BLK-mode + access capability. BLK-mode access uses memory-mapped-i/o + apertures to access persistent media. + + Say Y if your platform firmware emits an ACPI.NFIT table + (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode + capabilities. + config ND_BTT tristate diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index d2aab6c58492..594bb97c867a 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -1,11 +1,14 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o obj-$(CONFIG_ND_BTT) += nd_btt.o +obj-$(CONFIG_ND_BLK) += nd_blk.o nd_pmem-y := pmem.o nd_btt-y := btt.o +nd_blk-y := blk.o + libnvdimm-y := core.o libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c new file mode 100644 index 000000000000..9ac0c266c15c --- /dev/null +++ b/drivers/nvdimm/blk.c @@ -0,0 +1,245 @@ +/* + * NVDIMM Block Window Driver + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nd.h" + +struct nd_blk_device { + struct request_queue *queue; + struct gendisk *disk; + struct nd_namespace_blk *nsblk; + struct nd_blk_region *ndbr; + size_t disk_size; +}; + +static int nd_blk_major; + +static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, + resource_size_t ns_offset, unsigned int len) +{ + int i; + + for (i = 0; i < nsblk->num_resources; i++) { + if (ns_offset < resource_size(nsblk->res[i])) { + if (ns_offset + len > resource_size(nsblk->res[i])) { + dev_WARN_ONCE(&nsblk->common.dev, 1, + "illegal request\n"); + return SIZE_MAX; + } + return nsblk->res[i]->start + ns_offset; + } + ns_offset -= resource_size(nsblk->res[i]); + } + + dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n"); + return SIZE_MAX; +} + +static void nd_blk_make_request(struct request_queue *q, struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct gendisk *disk = bdev->bd_disk; + struct nd_namespace_blk *nsblk; + struct nd_blk_device *blk_dev; + struct nd_blk_region *ndbr; + struct bvec_iter iter; + struct bio_vec bvec; + int err = 0, rw; + + blk_dev = disk->private_data; + nsblk = blk_dev->nsblk; + ndbr = blk_dev->ndbr; + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + resource_size_t dev_offset; + void *iobuf; + + BUG_ON(len > PAGE_SIZE); + + dev_offset = to_dev_offset(nsblk, + iter.bi_sector << SECTOR_SHIFT, len); + if (dev_offset == SIZE_MAX) { + err = -EIO; + goto out; + } + + iobuf = kmap_atomic(bvec.bv_page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + bvec.bv_offset, + len, rw); + kunmap_atomic(iobuf); + if (err) + goto out; + } + + out: + bio_endio(bio, err); +} + +static int nd_blk_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *iobuf, size_t n, int rw) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim); + struct nd_namespace_blk *nsblk = blk_dev->nsblk; + struct nd_blk_region *ndbr = blk_dev->ndbr; + resource_size_t dev_offset; + + dev_offset = to_dev_offset(nsblk, offset, n); + + if (unlikely(offset + n > blk_dev->disk_size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (dev_offset == SIZE_MAX) + return -EIO; + + return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw); +} + +static const struct block_device_operations nd_blk_fops = { + .owner = THIS_MODULE, +}; + +static int nd_blk_attach_disk(struct nd_namespace_common *ndns, + struct nd_blk_device *blk_dev) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev); + struct gendisk *disk; + + blk_dev->queue = blk_alloc_queue(GFP_KERNEL); + if (!blk_dev->queue) + return -ENOMEM; + + blk_queue_make_request(blk_dev->queue, nd_blk_make_request); + blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX); + blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY); + blk_queue_logical_block_size(blk_dev->queue, nsblk->lbasize); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue); + + disk = blk_dev->disk = alloc_disk(0); + if (!disk) { + blk_cleanup_queue(blk_dev->queue); + return -ENOMEM; + } + + disk->driverfs_dev = &ndns->dev; + disk->major = nd_blk_major; + disk->first_minor = 0; + disk->fops = &nd_blk_fops; + disk->private_data = blk_dev; + disk->queue = blk_dev->queue; + disk->flags = GENHD_FL_EXT_DEVT; + nvdimm_namespace_disk_name(ndns, disk->disk_name); + set_capacity(disk, blk_dev->disk_size >> SECTOR_SHIFT); + add_disk(disk); + + return 0; +} + +static int nd_blk_probe(struct device *dev) +{ + struct nd_namespace_common *ndns; + struct nd_blk_device *blk_dev; + int rc; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + + blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL); + if (!blk_dev) + return -ENOMEM; + + blk_dev->disk_size = nvdimm_namespace_capacity(ndns); + blk_dev->ndbr = to_nd_blk_region(dev->parent); + blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev); + dev_set_drvdata(dev, blk_dev); + + ndns->rw_bytes = nd_blk_rw_bytes; + if (is_nd_btt(dev)) + rc = nvdimm_namespace_attach_btt(ndns); + else if (nd_btt_probe(ndns, blk_dev) == 0) { + /* we'll come back as btt-blk */ + rc = -ENXIO; + } else + rc = nd_blk_attach_disk(ndns, blk_dev); + if (rc) + kfree(blk_dev); + return rc; +} + +static void nd_blk_detach_disk(struct nd_blk_device *blk_dev) +{ + del_gendisk(blk_dev->disk); + put_disk(blk_dev->disk); + blk_cleanup_queue(blk_dev->queue); +} + +static int nd_blk_remove(struct device *dev) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(dev); + + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns); + else + nd_blk_detach_disk(blk_dev); + kfree(blk_dev); + + return 0; +} + +static struct nd_device_driver nd_blk_driver = { + .probe = nd_blk_probe, + .remove = nd_blk_remove, + .drv = { + .name = "nd_blk", + }, + .type = ND_DRIVER_NAMESPACE_BLK, +}; + +static int __init nd_blk_init(void) +{ + int rc; + + rc = register_blkdev(0, "nd_blk"); + if (rc < 0) + return rc; + + nd_blk_major = rc; + rc = nd_driver_register(&nd_blk_driver); + + if (rc < 0) + unregister_blkdev(nd_blk_major, "nd_blk"); + + return rc; +} + +static void __exit nd_blk_exit(void) +{ + driver_unregister(&nd_blk_driver.drv); + unregister_blkdev(nd_blk_major, "nd_blk"); +} + +MODULE_AUTHOR("Ross Zwisler "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK); +module_init(nd_blk_init); +module_exit(nd_blk_exit); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 83b179ed6d61..c05eb807d674 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -209,6 +209,15 @@ struct nvdimm *to_nvdimm(struct device *dev) } EXPORT_SYMBOL_GPL(to_nvdimm); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr) +{ + struct nd_region *nd_region = &ndbr->nd_region; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + return nd_mapping->nvdimm; +} +EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm); + struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) { struct nvdimm *nvdimm = nd_mapping->nvdimm; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 4aa647c8d644..1ce1e70de44a 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -173,6 +173,65 @@ static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk) return size; } +static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) +{ + struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + int count, i; + + if (!nsblk->uuid || !nsblk->lbasize || !ndd) + return false; + + count = 0; + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + for_each_dpa_resource(ndd, res) { + if (strcmp(res->name, label_id.id) != 0) + continue; + /* + * Resources with unacknoweldged adjustments indicate a + * failure to update labels + */ + if (res->flags & DPA_RESOURCE_ADJUSTED) + return false; + count++; + } + + /* These values match after a successful label update */ + if (count != nsblk->num_resources) + return false; + + for (i = 0; i < nsblk->num_resources; i++) { + struct resource *found = NULL; + + for_each_dpa_resource(ndd, res) + if (res == nsblk->res[i]) { + found = res; + break; + } + /* stale resource */ + if (!found) + return false; + } + + return true; +} + +resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) +{ + resource_size_t size; + + nvdimm_bus_lock(&nsblk->common.dev); + size = __nd_namespace_blk_validate(nsblk); + nvdimm_bus_unlock(&nsblk->common.dev); + + return size; +} +EXPORT_SYMBOL(nd_namespace_blk_validate); + + static int nd_namespace_label_update(struct nd_region *nd_region, struct device *dev) { @@ -1224,7 +1283,11 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) return ERR_PTR(-ENODEV); } } else if (is_namespace_blk(&ndns->dev)) { - return ERR_PTR(-ENODEV); /* TODO */ + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + if (!nd_namespace_blk_validate(nsblk)) + return ERR_PTR(-ENODEV); } return ndns; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 5e6413964776..e1970c71ad1c 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -43,9 +43,8 @@ struct nvdimm { }; bool is_nvdimm(struct device *dev); -bool is_nd_blk(struct device *dev); bool is_nd_pmem(struct device *dev); -struct nd_btt; +bool is_nd_blk(struct device *dev); struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 1b937c235913..f153f43ca3d6 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -102,6 +102,15 @@ struct nd_region { struct nd_mapping mapping[0]; }; +struct nd_blk_region { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + void *blk_provider_data; + struct nd_region nd_region; +}; + /* * Lookup next in the repeating sequence of 01, 10, and 11. */ @@ -171,8 +180,6 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) #endif struct nd_region *to_nd_region(struct device *dev); -unsigned int nd_region_acquire_lane(struct nd_region *nd_region); -void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); @@ -192,4 +199,6 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); +int nd_blk_region_init(struct nd_region *nd_region); +resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk); #endif /* __ND_H__ */ diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index eb8aebcd4800..f28f78ccff19 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -18,11 +18,10 @@ static int nd_region_probe(struct device *dev) { - int err; + int err, rc; static unsigned long once; struct nd_region_namespaces *num_ns; struct nd_region *nd_region = to_nd_region(dev); - int rc = nd_region_register_namespaces(nd_region, &err); if (nd_region->num_lanes > num_online_cpus() && nd_region->num_lanes < num_possible_cpus() @@ -34,6 +33,11 @@ static int nd_region_probe(struct device *dev) nd_region->num_lanes); } + rc = nd_blk_region_init(nd_region); + if (rc) + return rc; + + rc = nd_region_register_namespaces(nd_region, &err); num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL); if (!num_ns) return -ENOMEM; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index fe8ec21fe3d7..2cfb3f74bcbf 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #include +#include #include #include #include @@ -34,7 +35,10 @@ static void nd_region_release(struct device *dev) } free_percpu(nd_region->lane); ida_simple_remove(®ion_ida, nd_region->id); - kfree(nd_region); + if (is_nd_blk(dev)) + kfree(to_nd_blk_region(dev)); + else + kfree(nd_region); } static struct device_type nd_blk_device_type = { @@ -71,6 +75,33 @@ struct nd_region *to_nd_region(struct device *dev) } EXPORT_SYMBOL_GPL(to_nd_region); +struct nd_blk_region *to_nd_blk_region(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + WARN_ON(!is_nd_blk(dev)); + return container_of(nd_region, struct nd_blk_region, nd_region); +} +EXPORT_SYMBOL_GPL(to_nd_blk_region); + +void *nd_region_provider_data(struct nd_region *nd_region) +{ + return nd_region->provider_data; +} +EXPORT_SYMBOL_GPL(nd_region_provider_data); + +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr) +{ + return ndbr->blk_provider_data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_provider_data); + +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data) +{ + ndbr->blk_provider_data = data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data); + /** * nd_region_to_nstype() - region to an integer namespace type * @nd_region: region-device to interrogate @@ -365,7 +396,8 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) /* * Upon successful probe/remove, take/release a reference on the * associated interleave set (if present), and plant new btt + namespace - * seeds. + * seeds. Also, on the removal of a BLK region, notify the provider to + * disable the region. */ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, struct device *dev, bool probe) @@ -385,8 +417,14 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nd_mapping->labels = NULL; put_ndd(ndd); nd_mapping->ndd = NULL; - atomic_dec(&nvdimm->busy); + if (ndd) + atomic_dec(&nvdimm->busy); } + + if (is_nd_pmem(dev)) + return; + + to_nd_blk_region(dev)->disable(nvdimm_bus, dev); } if (dev->parent && is_nd_blk(dev->parent) && probe) { nd_region = to_nd_region(dev->parent); @@ -526,11 +564,21 @@ struct attribute_group nd_mapping_attribute_group = { }; EXPORT_SYMBOL_GPL(nd_mapping_attribute_group); -void *nd_region_provider_data(struct nd_region *nd_region) +int nd_blk_region_init(struct nd_region *nd_region) { - return nd_region->provider_data; + struct device *dev = &nd_region->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!is_nd_blk(dev)) + return 0; + + if (nd_region->ndr_mappings < 1) { + dev_err(dev, "invalid BLK region\n"); + return -ENXIO; + } + + return to_nd_blk_region(dev)->enable(nvdimm_bus, dev); } -EXPORT_SYMBOL_GPL(nd_region_provider_data); /** * nd_region_acquire_lane - allocate and lock a lane @@ -591,6 +639,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, { struct nd_region *nd_region; struct device *dev; + void *region_buf; unsigned int i; for (i = 0; i < ndr_desc->num_mappings; i++) { @@ -605,10 +654,30 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, } } - nd_region = kzalloc(sizeof(struct nd_region) - + sizeof(struct nd_mapping) * ndr_desc->num_mappings, - GFP_KERNEL); - if (!nd_region) + if (dev_type == &nd_blk_device_type) { + struct nd_blk_region_desc *ndbr_desc; + struct nd_blk_region *ndbr; + + ndbr_desc = to_blk_region_desc(ndr_desc); + ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + if (ndbr) { + nd_region = &ndbr->nd_region; + ndbr->enable = ndbr_desc->enable; + ndbr->disable = ndbr_desc->disable; + ndbr->do_io = ndbr_desc->do_io; + } + region_buf = ndbr; + } else { + nd_region = kzalloc(sizeof(struct nd_region) + + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + region_buf = nd_region; + } + + if (!region_buf) return NULL; nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); if (nd_region->id < 0) @@ -654,7 +723,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, err_percpu: ida_simple_remove(®ion_ida, nd_region->id); err_id: - kfree(nd_region); + kfree(region_buf); return NULL; } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 531d99dfac68..7fc1b25bdb5d 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,7 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include #include #include @@ -89,8 +90,24 @@ struct nd_region_desc { }; struct nvdimm_bus; -struct device; struct module; +struct device; +struct nd_blk_region; +struct nd_blk_region_desc { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + struct nd_region_desc ndr_desc; +}; + +static inline struct nd_blk_region_desc *to_blk_region_desc( + struct nd_region_desc *ndr_desc) +{ + return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc); + +} + struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ @@ -99,10 +116,10 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); struct nd_region *to_nd_region(struct device *dev); +struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); -void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -120,5 +137,11 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +void *nd_region_provider_data(struct nd_region *nd_region); +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3-59-g8ed1b From fcae695737fca0849c18db814d9d8de05c0fd2a2 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Jun 2015 04:22:39 -0400 Subject: libnvdimm, blk: add support for blk integrity Support multiple block sizes (sector + metadata) for nd_blk in the same way as done for the BTT. Add the idea of an 'internal' lbasize, which is properly aligned and padded, and store metadata in this space. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/blk.c | 174 +++++++++++++++++++++++++++++++++++----- drivers/nvdimm/btt.h | 1 - drivers/nvdimm/core.c | 3 + drivers/nvdimm/namespace_devs.c | 3 +- drivers/nvdimm/nd.h | 1 + 5 files changed, 159 insertions(+), 23 deletions(-) (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 9ac0c266c15c..5c44e067652f 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -27,10 +27,17 @@ struct nd_blk_device { struct nd_namespace_blk *nsblk; struct nd_blk_region *ndbr; size_t disk_size; + u32 sector_size; + u32 internal_lbasize; }; static int nd_blk_major; +static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev) +{ + return blk_dev->nsblk->lbasize - blk_dev->sector_size; +} + static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, resource_size_t ns_offset, unsigned int len) { @@ -52,41 +59,145 @@ static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, return SIZE_MAX; } +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, u64 lba, + int rw) +{ + unsigned int len = nd_blk_meta_size(blk_dev); + resource_size_t dev_offset, ns_offset; + struct nd_namespace_blk *nsblk; + struct nd_blk_region *ndbr; + int err = 0; + + nsblk = blk_dev->nsblk; + ndbr = blk_dev->ndbr; + ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size; + dev_offset = to_dev_offset(nsblk, ns_offset, len); + if (dev_offset == SIZE_MAX) + return -EIO; + + while (len) { + unsigned int cur_len; + struct bio_vec bv; + void *iobuf; + + bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + /* + * The 'bv' obtained from bvec_iter_bvec has its .bv_len and + * .bv_offset already adjusted for iter->bi_bvec_done, and we + * can use those directly + */ + + cur_len = min(len, bv.bv_len); + iobuf = kmap_atomic(bv.bv_page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset, + cur_len, rw); + kunmap_atomic(iobuf); + if (err) + return err; + + len -= cur_len; + dev_offset += cur_len; + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len); + } + + return err; +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, u64 lba, + int rw) +{ + return 0; +} +#endif + +static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + struct nd_blk_region *ndbr = blk_dev->ndbr; + resource_size_t dev_offset, ns_offset; + int err = 0; + void *iobuf; + u64 lba; + + while (len) { + unsigned int cur_len; + + /* + * If we don't have an integrity payload, we don't have to + * split the bvec into sectors, as this would cause unnecessary + * Block Window setup/move steps. the do_io routine is capable + * of handling len <= PAGE_SIZE. + */ + cur_len = bip ? min(len, blk_dev->sector_size) : len; + + lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size); + ns_offset = lba * blk_dev->internal_lbasize; + dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len); + if (dev_offset == SIZE_MAX) + return -EIO; + + iobuf = kmap_atomic(page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw); + kunmap_atomic(iobuf); + if (err) + return err; + + if (bip) { + err = nd_blk_rw_integrity(blk_dev, bip, lba, rw); + if (err) + return err; + } + len -= cur_len; + off += cur_len; + sector += blk_dev->sector_size >> SECTOR_SHIFT; + } + + return err; +} + static void nd_blk_make_request(struct request_queue *q, struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct gendisk *disk = bdev->bd_disk; - struct nd_namespace_blk *nsblk; + struct bio_integrity_payload *bip; struct nd_blk_device *blk_dev; - struct nd_blk_region *ndbr; struct bvec_iter iter; struct bio_vec bvec; int err = 0, rw; + /* + * bio_integrity_enabled also checks if the bio already has an + * integrity payload attached. If it does, we *don't* do a + * bio_integrity_prep here - the payload has been generated by + * another kernel subsystem, and we just pass it through. + */ + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + err = -EIO; + goto out; + } + + bip = bio_integrity(bio); blk_dev = disk->private_data; - nsblk = blk_dev->nsblk; - ndbr = blk_dev->ndbr; rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; - resource_size_t dev_offset; - void *iobuf; BUG_ON(len > PAGE_SIZE); - - dev_offset = to_dev_offset(nsblk, - iter.bi_sector << SECTOR_SHIFT, len); - if (dev_offset == SIZE_MAX) { - err = -EIO; + err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len, + bvec.bv_offset, rw, iter.bi_sector); + if (err) { + dev_info(&blk_dev->nsblk->common.dev, + "io error in %s sector %lld, len %d,\n", + (rw == READ) ? "READ" : "WRITE", + (unsigned long long) iter.bi_sector, len); goto out; } - - iobuf = kmap_atomic(bvec.bv_page); - err = ndbr->do_io(ndbr, dev_offset, iobuf + bvec.bv_offset, - len, rw); - kunmap_atomic(iobuf); - if (err) - goto out; } out: @@ -121,8 +232,12 @@ static const struct block_device_operations nd_blk_fops = { static int nd_blk_attach_disk(struct nd_namespace_common *ndns, struct nd_blk_device *blk_dev) { - struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev); + resource_size_t available_disk_size; struct gendisk *disk; + u64 internal_nlba; + + internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize); + available_disk_size = internal_nlba * blk_dev->sector_size; blk_dev->queue = blk_alloc_queue(GFP_KERNEL); if (!blk_dev->queue) @@ -131,7 +246,7 @@ static int nd_blk_attach_disk(struct nd_namespace_common *ndns, blk_queue_make_request(blk_dev->queue, nd_blk_make_request); blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX); blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY); - blk_queue_logical_block_size(blk_dev->queue, nsblk->lbasize); + blk_queue_logical_block_size(blk_dev->queue, blk_dev->sector_size); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue); disk = blk_dev->disk = alloc_disk(0); @@ -148,15 +263,28 @@ static int nd_blk_attach_disk(struct nd_namespace_common *ndns, disk->queue = blk_dev->queue; disk->flags = GENHD_FL_EXT_DEVT; nvdimm_namespace_disk_name(ndns, disk->disk_name); - set_capacity(disk, blk_dev->disk_size >> SECTOR_SHIFT); + set_capacity(disk, 0); add_disk(disk); + if (nd_blk_meta_size(blk_dev)) { + int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev)); + + if (rc) { + del_gendisk(disk); + put_disk(disk); + blk_cleanup_queue(blk_dev->queue); + return rc; + } + } + + set_capacity(disk, available_disk_size >> SECTOR_SHIFT); return 0; } static int nd_blk_probe(struct device *dev) { struct nd_namespace_common *ndns; + struct nd_namespace_blk *nsblk; struct nd_blk_device *blk_dev; int rc; @@ -168,9 +296,13 @@ static int nd_blk_probe(struct device *dev) if (!blk_dev) return -ENOMEM; + nsblk = to_nd_namespace_blk(&ndns->dev); blk_dev->disk_size = nvdimm_namespace_capacity(ndns); blk_dev->ndbr = to_nd_blk_region(dev->parent); blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev); + blk_dev->internal_lbasize = roundup(nsblk->lbasize, + INT_LBASIZE_ALIGNMENT); + blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 4096 : 512); dev_set_drvdata(dev, blk_dev); ndns->rw_bytes = nd_blk_rw_bytes; diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index 2caa0ef7e67a..75b0d80a6bd9 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -31,7 +31,6 @@ #define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ #define RTT_VALID (1UL << 31) #define RTT_INVALID 0 -#define INT_LBASIZE_ALIGNMENT 64 #define BTT_PG_SIZE 4096 #define BTT_DEFAULT_NFREE ND_MAX_LANES #define LOG_SEQ_INIT 1 diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 1d96b9a6e4cc..4288169432de 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -379,6 +379,9 @@ int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) }; int ret; + if (meta_size == 0) + return 0; + ret = blk_integrity_register(disk, &integrity); if (ret) return ret; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 1ce1e70de44a..27d69bd3b4d6 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1059,7 +1059,8 @@ static ssize_t resource_show(struct device *dev, } static DEVICE_ATTR_RO(resource); -static const unsigned long ns_lbasize_supported[] = { 512, 0 }; +static const unsigned long ns_lbasize_supported[] = { 512, 520, 528, + 4096, 4104, 4160, 4224, 0 }; static ssize_t sector_size_show(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index f4459faa456c..6f916b65b7d6 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -27,6 +27,7 @@ enum { */ ND_MAX_LANES = 256, SECTOR_SHIFT = 9, + INT_LBASIZE_ALIGNMENT = 64, }; struct nvdimm_drvdata { -- cgit v1.2.3-59-g8ed1b From 74ae66c3b14ffa94c8d2dea201cdf8e6203d13d5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:34 -0600 Subject: libnvdimm: Add sysfs numa_node to NVDIMM devices Add support of sysfs 'numa_node' to I/O-related NVDIMM devices under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.x. An example of numa_node values on a 2-socket system with a single NVDIMM range on each socket is shown below. /sys/bus/nd/devices |-- btt0.0/numa_node:0 |-- btt1.0/numa_node:1 |-- btt1.1/numa_node:1 |-- namespace0.0/numa_node:0 |-- namespace1.0/numa_node:1 |-- region0/numa_node:0 |-- region1/numa_node:1 These numa_node files are then linked under the block class of their device names. /sys/class/block/pmem0/device/numa_node:0 /sys/class/block/pmem1s/device/numa_node:1 This enables numactl(8) to accept 'block:' and 'file:' paths of pmem and btt devices as shown in the examples below. numactl --preferred block:pmem0 --show numactl --preferred file:/dev/pmem1s --show Signed-off-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 1 + drivers/nvdimm/btt_devs.c | 1 + drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++ drivers/nvdimm/namespace_devs.c | 1 + include/linux/libnvdimm.h | 1 + 5 files changed, 34 insertions(+) (limited to 'drivers/nvdimm/namespace_devs.c') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index d96c8fe974dd..2161fa178c8d 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { &nd_region_attribute_group, &nd_mapping_attribute_group, &nd_device_attribute_group, + &nd_numa_attribute_group, &acpi_nfit_region_attribute_group, NULL, }; diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 661aacedc140..6ac8c0fea3ec 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -293,6 +293,7 @@ static struct attribute_group nd_btt_attribute_group = { static const struct attribute_group *nd_btt_attribute_groups[] = { &nd_btt_attribute_group, &nd_device_attribute_group, + &nd_numa_attribute_group, NULL, }; diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 205344643852..8eb22c0ca7ce 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -280,6 +280,36 @@ struct attribute_group nd_device_attribute_group = { }; EXPORT_SYMBOL_GPL(nd_device_attribute_group); +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static struct attribute *nd_numa_attributes[] = { + &dev_attr_numa_node.attr, + NULL, +}; + +static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + if (!IS_ENABLED(CONFIG_NUMA)) + return 0; + + return a->mode; +} + +/** + * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus + */ +struct attribute_group nd_numa_attribute_group = { + .attrs = nd_numa_attributes, + .is_visible = nd_numa_attr_visible, +}; +EXPORT_SYMBOL_GPL(nd_numa_attribute_group); + int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) { dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 27d69bd3b4d6..fef0dd80d4ad 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1228,6 +1228,7 @@ static struct attribute_group nd_namespace_attribute_group = { static const struct attribute_group *nd_namespace_attribute_groups[] = { &nd_device_attribute_group, &nd_namespace_attribute_group, + &nd_numa_attribute_group, NULL, }; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 30b3deaafd51..75e3af01ee32 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -38,6 +38,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_numa_attribute_group; extern struct attribute_group nd_region_attribute_group; extern struct attribute_group nd_mapping_attribute_group; -- cgit v1.2.3-59-g8ed1b