diff options
Diffstat (limited to 'drivers/dax')
-rw-r--r-- | drivers/dax/Kconfig | 18 | ||||
-rw-r--r-- | drivers/dax/Makefile | 6 | ||||
-rw-r--r-- | drivers/dax/bus.c | 1120 | ||||
-rw-r--r-- | drivers/dax/bus.h | 36 | ||||
-rw-r--r-- | drivers/dax/dax-private.h | 59 | ||||
-rw-r--r-- | drivers/dax/device.c | 256 | ||||
-rw-r--r-- | drivers/dax/hmem/Makefile | 6 | ||||
-rw-r--r-- | drivers/dax/hmem/device.c | 101 | ||||
-rw-r--r-- | drivers/dax/hmem/hmem.c (renamed from drivers/dax/hmem.c) | 23 | ||||
-rw-r--r-- | drivers/dax/kmem.c | 270 | ||||
-rw-r--r-- | drivers/dax/pmem.c (renamed from drivers/dax/pmem/core.c) | 56 | ||||
-rw-r--r-- | drivers/dax/pmem/Makefile | 1 | ||||
-rw-r--r-- | drivers/dax/pmem/compat.c | 73 | ||||
-rw-r--r-- | drivers/dax/pmem/pmem.c | 30 | ||||
-rw-r--r-- | drivers/dax/super.c | 485 |
15 files changed, 1792 insertions, 748 deletions
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3b6c06f07326..5fdf269a822e 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -1,8 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -config DAX_DRIVER - select DAX - bool - menuconfig DAX tristate "DAX: direct access to differentiated memory" select SRCU @@ -35,6 +31,7 @@ config DEV_DAX_PMEM config DEV_DAX_HMEM tristate "HMEM DAX: direct access to 'specific purpose' memory" depends on EFI_SOFT_RESERVE + select NUMA_KEEP_MEMINFO if (NUMA && X86) default DEV_DAX help EFI 2.8 platforms, and others, may advertise 'specific purpose' @@ -48,6 +45,10 @@ config DEV_DAX_HMEM Say M if unsure. +config DEV_DAX_HMEM_DEVICES + depends on DEV_DAX_HMEM && DAX=y + def_bool y + config DEV_DAX_KMEM tristate "KMEM DAX: volatile-use of persistent memory" default DEV_DAX @@ -65,13 +66,4 @@ config DEV_DAX_KMEM Say N if unsure. -config DEV_DAX_PMEM_COMPAT - tristate "PMEM DAX: support the deprecated /sys/class/dax interface" - depends on m && DEV_DAX_PMEM=m - default DEV_DAX_PMEM - help - Older versions of the libdaxctl library expect to find all - device-dax instances under /sys/class/dax. If libdaxctl in - your distribution is older than v58 say M, otherwise say N. - endif diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 80065b38b3c4..90a56ca3b345 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -2,11 +2,11 @@ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o -obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o dax-y := super.o dax-y += bus.o device_dax-y := device.o -dax_hmem-y := hmem.o +dax_pmem-y := pmem.o -obj-y += pmem/ +obj-y += hmem/ diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 46e46047a1f7..1dad813ee4a6 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -6,11 +6,10 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/dax.h> +#include <linux/io.h> #include "dax-private.h" #include "bus.h" -static struct class *dax_class; - static DEFINE_MUTEX(dax_bus_lock); #define DAX_NAME_LEN 30 @@ -89,13 +88,11 @@ static ssize_t do_id_store(struct device_driver *drv, const char *buf, list_add(&dax_id->list, &dax_drv->ids); } else rc = -ENOMEM; - } else - /* nothing to remove */; + } } else if (action == ID_REMOVE) { list_del(&dax_id->list); kfree(dax_id); - } else - /* dax_id already added */; + } mutex_unlock(&dax_bus_lock); if (rc < 0) @@ -130,10 +127,88 @@ ATTRIBUTE_GROUPS(dax_drv); static int dax_bus_match(struct device *dev, struct device_driver *drv); +/* + * Static dax regions are regions created by an external subsystem + * nvdimm where a single range is assigned. Its boundaries are by the external + * subsystem and are usually limited to one physical memory range. For example, + * for PMEM it is usually defined by NVDIMM Namespace boundaries (i.e. a + * single contiguous range) + * + * On dynamic dax regions, the assigned region can be partitioned by dax core + * into multiple subdivisions. A subdivision is represented into one + * /dev/daxN.M device composed by one or more potentially discontiguous ranges. + * + * When allocating a dax region, drivers must set whether it's static + * (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned + * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax + * devices it is NULL but afterwards allocated by dax core on device ->probe(). + * Care is needed to make sure that dynamic dax devices are torn down with a + * cleared @pgmap field (see kill_dev_dax()). + */ +static bool is_static(struct dax_region *dax_region) +{ + return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; +} + +bool static_dev_dax(struct dev_dax *dev_dax) +{ + return is_static(dev_dax->region); +} +EXPORT_SYMBOL_GPL(static_dev_dax); + +static u64 dev_dax_size(struct dev_dax *dev_dax) +{ + u64 size = 0; + int i; + + device_lock_assert(&dev_dax->dev); + + for (i = 0; i < dev_dax->nr_range; i++) + size += range_len(&dev_dax->ranges[i].range); + + return size; +} + +static int dax_bus_probe(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + int rc; + + if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0) + return -ENXIO; + + rc = dax_drv->probe(dev_dax); + + if (rc || is_static(dax_region)) + return rc; + + /* + * Track new seed creation only after successful probe of the + * previous seed. + */ + if (dax_region->seed == dev) + dax_region->seed = NULL; + + return 0; +} + +static void dax_bus_remove(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + + if (dax_drv->remove) + dax_drv->remove(dev_dax); +} + static struct bus_type dax_bus_type = { .name = "dax", .uevent = dax_bus_uevent, .match = dax_bus_match, + .probe = dax_bus_probe, + .remove = dax_bus_remove, .drv_groups = dax_drv_groups, }; @@ -176,18 +251,286 @@ static ssize_t region_size_show(struct device *dev, static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); -static ssize_t align_show(struct device *dev, +static ssize_t region_align_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dax_region *dax_region = dev_get_drvdata(dev); return sprintf(buf, "%u\n", dax_region->align); } -static DEVICE_ATTR_RO(align); +static struct device_attribute dev_attr_region_align = + __ATTR(align, 0400, region_align_show, NULL); + +#define for_each_dax_region_resource(dax_region, res) \ + for (res = (dax_region)->res.child; res; res = res->sibling) + +static unsigned long long dax_region_avail_size(struct dax_region *dax_region) +{ + resource_size_t size = resource_size(&dax_region->res); + struct resource *res; + + device_lock_assert(dax_region->dev); + + for_each_dax_region_resource(dax_region, res) + size -= resource_size(res); + return size; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long size; + + device_lock(dev); + size = dax_region_avail_size(dax_region); + device_unlock(dev); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *seed; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + seed = dax_region->seed; + rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(seed); + +static ssize_t create_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *youngest; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + youngest = dax_region->youngest; + rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); + device_unlock(dev); + + return rc; +} + +static ssize_t create_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long avail; + ssize_t rc; + int val; + + if (is_static(dax_region)) + return -EINVAL; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + if (val != 1) + return -EINVAL; + + device_lock(dev); + avail = dax_region_avail_size(dax_region); + if (avail == 0) + rc = -ENOSPC; + else { + struct dev_dax_data data = { + .dax_region = dax_region, + .size = 0, + .id = -1, + }; + struct dev_dax *dev_dax = devm_create_dev_dax(&data); + + if (IS_ERR(dev_dax)) + rc = PTR_ERR(dev_dax); + else { + /* + * In support of crafting multiple new devices + * simultaneously multiple seeds can be created, + * but only the first one that has not been + * successfully bound is tracked as the region + * seed. + */ + if (!dax_region->seed) + dax_region->seed = &dev_dax->dev; + dax_region->youngest = &dev_dax->dev; + rc = len; + } + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(create); + +void kill_dev_dax(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + + /* + * Dynamic dax region have the pgmap allocated via dev_kzalloc() + * and thus freed by devm. Clear the pgmap to not have stale pgmap + * ranges on probe() from previous reconfigurations of region devices. + */ + if (!static_dev_dax(dev_dax)) + dev_dax->pgmap = NULL; +} +EXPORT_SYMBOL_GPL(kill_dev_dax); + +static void trim_dev_dax_range(struct dev_dax *dev_dax) +{ + int i = dev_dax->nr_range - 1; + struct range *range = &dev_dax->ranges[i].range; + struct dax_region *dax_region = dev_dax->region; + + device_lock_assert(dax_region->dev); + dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, + (unsigned long long)range->start, + (unsigned long long)range->end); + + __release_region(&dax_region->res, range->start, range_len(range)); + if (--dev_dax->nr_range == 0) { + kfree(dev_dax->ranges); + dev_dax->ranges = NULL; + } +} + +static void free_dev_dax_ranges(struct dev_dax *dev_dax) +{ + while (dev_dax->nr_range) + trim_dev_dax_range(dev_dax); +} + +static void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + dev_dbg(dev, "%s\n", __func__); + + kill_dev_dax(dev_dax); + free_dev_dax_ranges(dev_dax); + device_del(dev); + put_device(dev); +} + +/* a return value >= 0 indicates this invocation invalidated the id */ +static int __free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + int rc = dev_dax->id; + + device_lock_assert(dev); + + if (is_static(dax_region) || dev_dax->id < 0) + return -1; + ida_free(&dax_region->ida, dev_dax->id); + dev_dax->id = -1; + return rc; +} + +static int free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + int rc; + + device_lock(dev); + rc = __free_dev_dax_id(dev_dax); + device_unlock(dev); + return rc; +} + +static ssize_t delete_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct dev_dax *dev_dax; + struct device *victim; + bool do_del = false; + int rc; + + if (is_static(dax_region)) + return -EINVAL; + + victim = device_find_child_by_name(dax_region->dev, buf); + if (!victim) + return -ENXIO; + + device_lock(dev); + device_lock(victim); + dev_dax = to_dev_dax(victim); + if (victim->driver || dev_dax_size(dev_dax)) + rc = -EBUSY; + else { + /* + * Invalidate the device so it does not become active + * again, but always preserve device-id-0 so that + * /sys/bus/dax/ is guaranteed to be populated while any + * dax_region is registered. + */ + if (dev_dax->id > 0) { + do_del = __free_dev_dax_id(dev_dax) >= 0; + rc = len; + if (dax_region->seed == victim) + dax_region->seed = NULL; + if (dax_region->youngest == victim) + dax_region->youngest = NULL; + } else + rc = -EBUSY; + } + device_unlock(victim); + + /* won the race to invalidate the device, clean it up */ + if (do_del) + devm_release_action(dev, unregister_dev_dax, victim); + device_unlock(dev); + put_device(victim); + + return rc; +} +static DEVICE_ATTR_WO(delete); + +static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dax_region *dax_region = dev_get_drvdata(dev); + + if (is_static(dax_region)) + if (a == &dev_attr_available_size.attr + || a == &dev_attr_create.attr + || a == &dev_attr_seed.attr + || a == &dev_attr_delete.attr) + return 0; + return a->mode; +} static struct attribute *dax_region_attributes[] = { + &dev_attr_available_size.attr, &dev_attr_region_size.attr, - &dev_attr_align.attr, + &dev_attr_region_align.attr, + &dev_attr_create.attr, + &dev_attr_seed.attr, + &dev_attr_delete.attr, &dev_attr_id.attr, NULL, }; @@ -195,6 +538,7 @@ static struct attribute *dax_region_attributes[] = { static const struct attribute_group dax_region_attribute_group = { .name = "dax_region", .attrs = dax_region_attributes, + .is_visible = dax_region_visible, }; static const struct attribute_group *dax_region_attribute_groups[] = { @@ -226,8 +570,8 @@ static void dax_region_unregister(void *region) } struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, - unsigned long long pfn_flags) + struct range *range, int target_node, unsigned int align, + unsigned long flags) { struct dax_region *dax_region; @@ -241,8 +585,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, return NULL; } - if (!IS_ALIGNED(res->start, align) - || !IS_ALIGNED(resource_size(res), align)) + if (!IS_ALIGNED(range->start, align) + || !IS_ALIGNED(range_len(range), align)) return NULL; dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); @@ -250,13 +594,18 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, return NULL; dev_set_drvdata(parent, dax_region); - memcpy(&dax_region->res, res, sizeof(*res)); - dax_region->pfn_flags = pfn_flags; kref_init(&dax_region->kref); dax_region->id = region_id; dax_region->align = align; dax_region->dev = parent; dax_region->target_node = target_node; + ida_init(&dax_region->ida); + dax_region->res = (struct resource) { + .start = range->start, + .end = range->end, + .flags = IORESOURCE_MEM | flags, + }; + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { kfree(dax_region); return NULL; @@ -269,45 +618,606 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); +static void dax_mapping_release(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + + ida_free(&dev_dax->ida, mapping->id); + kfree(mapping); +} + +static void unregister_dax_mapping(void *data) +{ + struct device *dev = data; + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + dev_dbg(dev, "%s\n", __func__); + + device_lock_assert(dax_region->dev); + + dev_dax->ranges[mapping->range_id].mapping = NULL; + mapping->range_id = -1; + + device_del(dev); + put_device(dev); +} + +static struct dev_dax_range *get_dax_range(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + device_lock(dax_region->dev); + if (mapping->range_id < 0) { + device_unlock(dax_region->dev); + return NULL; + } + + return &dev_dax->ranges[mapping->range_id]; +} + +static void put_dax_range(struct dev_dax_range *dax_range) +{ + struct dax_mapping *mapping = dax_range->mapping; + struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent); + struct dax_region *dax_region = dev_dax->region; + + device_unlock(dax_region->dev); +} + +static ssize_t start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.start); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(start, 0400, start_show, NULL); + +static ssize_t end_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.end); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(end, 0400, end_show, NULL); + +static ssize_t pgoff_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#lx\n", dax_range->pgoff); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL); + +static struct attribute *dax_mapping_attributes[] = { + &dev_attr_start.attr, + &dev_attr_end.attr, + &dev_attr_page_offset.attr, + NULL, +}; + +static const struct attribute_group dax_mapping_attribute_group = { + .attrs = dax_mapping_attributes, +}; + +static const struct attribute_group *dax_mapping_attribute_groups[] = { + &dax_mapping_attribute_group, + NULL, +}; + +static struct device_type dax_mapping_type = { + .release = dax_mapping_release, + .groups = dax_mapping_attribute_groups, +}; + +static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id) +{ + struct dax_region *dax_region = dev_dax->region; + struct dax_mapping *mapping; + struct device *dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver, + "region disabled\n")) + return -ENXIO; + + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return -ENOMEM; + mapping->range_id = range_id; + mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL); + if (mapping->id < 0) { + kfree(mapping); + return -ENOMEM; + } + dev_dax->ranges[range_id].mapping = mapping; + dev = &mapping->dev; + device_initialize(dev); + dev->parent = &dev_dax->dev; + dev->type = &dax_mapping_type; + dev_set_name(dev, "mapping%d", mapping->id); + rc = device_add(dev); + if (rc) { + put_device(dev); + return rc; + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_mapping, + dev); + if (rc) + return rc; + return 0; +} + +static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, + resource_size_t size) +{ + struct dax_region *dax_region = dev_dax->region; + struct resource *res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct dev_dax_range *ranges; + unsigned long pgoff = 0; + struct resource *alloc; + int i, rc; + + device_lock_assert(dax_region->dev); + + /* handle the seed alloc special case */ + if (!size) { + if (dev_WARN_ONCE(dev, dev_dax->nr_range, + "0-size allocation must be first\n")) + return -EBUSY; + /* nr_range == 0 is elsewhere special cased as 0-size device */ + return 0; + } + + alloc = __request_region(res, start, size, dev_name(dev), 0); + if (!alloc) + return -ENOMEM; + + ranges = krealloc(dev_dax->ranges, sizeof(*ranges) + * (dev_dax->nr_range + 1), GFP_KERNEL); + if (!ranges) { + __release_region(res, alloc->start, resource_size(alloc)); + return -ENOMEM; + } + + for (i = 0; i < dev_dax->nr_range; i++) + pgoff += PHYS_PFN(range_len(&ranges[i].range)); + dev_dax->ranges = ranges; + ranges[dev_dax->nr_range++] = (struct dev_dax_range) { + .pgoff = pgoff, + .range = { + .start = alloc->start, + .end = alloc->end, + }, + }; + + dev_dbg(dev, "alloc range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, + &alloc->start, &alloc->end); + /* + * A dev_dax instance must be registered before mapping device + * children can be added. Defer to devm_create_dev_dax() to add + * the initial mapping device. + */ + if (!device_is_registered(&dev_dax->dev)) + return 0; + + rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1); + if (rc) + trim_dev_dax_range(dev_dax); + + return rc; +} + +static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size) +{ + int last_range = dev_dax->nr_range - 1; + struct dev_dax_range *dax_range = &dev_dax->ranges[last_range]; + struct dax_region *dax_region = dev_dax->region; + bool is_shrink = resource_size(res) > size; + struct range *range = &dax_range->range; + struct device *dev = &dev_dax->dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n")) + return -EINVAL; + + rc = adjust_resource(res, range->start, size); + if (rc) + return rc; + + *range = (struct range) { + .start = range->start, + .end = range->start + size - 1, + }; + + dev_dbg(dev, "%s range[%d]: %#llx:%#llx\n", is_shrink ? "shrink" : "extend", + last_range, (unsigned long long) range->start, + (unsigned long long) range->end); + + return 0; +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); - unsigned long long size = resource_size(&dev_dax->region->res); + unsigned long long size; + + device_lock(dev); + size = dev_dax_size(dev_dax); + device_unlock(dev); return sprintf(buf, "%llu\n", size); } -static DEVICE_ATTR_RO(size); -static int dev_dax_target_node(struct dev_dax *dev_dax) +static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size) +{ + /* + * The minimum mapping granularity for a device instance is a + * single subsection, unless the arch says otherwise. + */ + return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align, memremap_compat_align())); +} + +static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size) { + resource_size_t to_shrink = dev_dax_size(dev_dax) - size; struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + int i; + + for (i = dev_dax->nr_range - 1; i >= 0; i--) { + struct range *range = &dev_dax->ranges[i].range; + struct dax_mapping *mapping = dev_dax->ranges[i].mapping; + struct resource *adjust = NULL, *res; + resource_size_t shrink; + + shrink = min_t(u64, to_shrink, range_len(range)); + if (shrink >= range_len(range)) { + devm_release_action(dax_region->dev, + unregister_dax_mapping, &mapping->dev); + trim_dev_dax_range(dev_dax); + to_shrink -= shrink; + if (!to_shrink) + break; + continue; + } + + for_each_dax_region_resource(dax_region, res) + if (strcmp(res->name, dev_name(dev)) == 0 + && res->start == range->start) { + adjust = res; + break; + } + + if (dev_WARN_ONCE(dev, !adjust || i != dev_dax->nr_range - 1, + "failed to find matching resource\n")) + return -ENXIO; + return adjust_dev_dax_range(dev_dax, adjust, range_len(range) + - shrink); + } + return 0; +} - return dax_region->target_node; +/* + * Only allow adjustments that preserve the relative pgoff of existing + * allocations. I.e. the dev_dax->ranges array is ordered by increasing pgoff. + */ +static bool adjust_ok(struct dev_dax *dev_dax, struct resource *res) +{ + struct dev_dax_range *last; + int i; + + if (dev_dax->nr_range == 0) + return false; + if (strcmp(res->name, dev_name(&dev_dax->dev)) != 0) + return false; + last = &dev_dax->ranges[dev_dax->nr_range - 1]; + if (last->range.start != res->start || last->range.end != res->end) + return false; + for (i = 0; i < dev_dax->nr_range - 1; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + + if (dax_range->pgoff > last->pgoff) + return false; + } + + return true; } -static ssize_t target_node_show(struct device *dev, +static ssize_t dev_dax_resize(struct dax_region *dax_region, + struct dev_dax *dev_dax, resource_size_t size) +{ + resource_size_t avail = dax_region_avail_size(dax_region), to_alloc; + resource_size_t dev_size = dev_dax_size(dev_dax); + struct resource *region_res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct resource *res, *first; + resource_size_t alloc = 0; + int rc; + + if (dev->driver) + return -EBUSY; + if (size == dev_size) + return 0; + if (size > dev_size && size - dev_size > avail) + return -ENOSPC; + if (size < dev_size) + return dev_dax_shrink(dev_dax, size); + + to_alloc = size - dev_size; + if (dev_WARN_ONCE(dev, !alloc_is_aligned(dev_dax, to_alloc), + "resize of %pa misaligned\n", &to_alloc)) + return -ENXIO; + + /* + * Expand the device into the unused portion of the region. This + * may involve adjusting the end of an existing resource, or + * allocating a new resource. + */ +retry: + first = region_res->child; + if (!first) + return alloc_dev_dax_range(dev_dax, dax_region->res.start, to_alloc); + + rc = -ENOSPC; + for (res = first; res; res = res->sibling) { + struct resource *next = res->sibling; + + /* space at the beginning of the region */ + if (res == first && res->start > dax_region->res.start) { + alloc = min(res->start - dax_region->res.start, to_alloc); + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, alloc); + break; + } + + alloc = 0; + /* space between allocations */ + if (next && next->start > res->end + 1) + alloc = min(next->start - (res->end + 1), to_alloc); + + /* space at the end of the region */ + if (!alloc && !next && res->end < region_res->end) + alloc = min(region_res->end - res->end, to_alloc); + + if (!alloc) + continue; + + if (adjust_ok(dev_dax, res)) { + rc = adjust_dev_dax_range(dev_dax, res, resource_size(res) + alloc); + break; + } + rc = alloc_dev_dax_range(dev_dax, res->end + 1, alloc); + break; + } + if (rc) + return rc; + to_alloc -= alloc; + if (to_alloc) + goto retry; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + ssize_t rc; + unsigned long long val; + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + if (!alloc_is_aligned(dev_dax, val)) { + dev_dbg(dev, "%s: size: %lld misaligned\n", __func__, val); + return -EINVAL; + } + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + device_lock(dev); + rc = dev_dax_resize(dax_region, dev_dax, val); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_RW(size); + +static ssize_t range_parse(const char *opt, size_t len, struct range *range) +{ + unsigned long long addr = 0; + char *start, *end, *str; + ssize_t rc = -EINVAL; + + str = kstrdup(opt, GFP_KERNEL); + if (!str) + return rc; + + end = str; + start = strsep(&end, "-"); + if (!start || !end) + goto err; + + rc = kstrtoull(start, 16, &addr); + if (rc) + goto err; + range->start = addr; + + rc = kstrtoull(end, 16, &addr); + if (rc) + goto err; + range->end = addr; + +err: + kfree(str); + return rc; +} + +static ssize_t mapping_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + size_t to_alloc; + struct range r; + ssize_t rc; + + rc = range_parse(buf, len, &r); + if (rc) + return rc; + + rc = -ENXIO; + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return rc; + } + device_lock(dev); + + to_alloc = range_len(&r); + if (alloc_is_aligned(dev_dax, to_alloc)) + rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_WO(mapping); + +static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); + return sprintf(buf, "%d\n", dev_dax->align); } -static DEVICE_ATTR_RO(target_node); -static unsigned long long dev_dax_resource(struct dev_dax *dev_dax) +static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + int i; + + for (i = 0; i < dev_dax->nr_range; i++) { + size_t len = range_len(&dev_dax->ranges[i].range); + + if (!alloc_is_aligned(dev_dax, len)) { + dev_dbg(dev, "%s: align %u invalid for range %d\n", + __func__, dev_dax->align, i); + return -EINVAL; + } + } + + return 0; +} + +static ssize_t align_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + unsigned long val, align_save; + ssize_t rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return -ENXIO; + + if (!dax_align_valid(val)) + return -EINVAL; + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + + device_lock(dev); + if (dev->driver) { + rc = -EBUSY; + goto out_unlock; + } + + align_save = dev_dax->align; + dev_dax->align = val; + rc = dev_dax_validate_align(dev_dax); + if (rc) + dev_dax->align = align_save; +out_unlock: + device_unlock(dev); + device_unlock(dax_region->dev); + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_RW(align); + +static int dev_dax_target_node(struct dev_dax *dev_dax) { struct dax_region *dax_region = dev_dax->region; - return dax_region->res.start; + return dax_region->target_node; } +static ssize_t target_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); +} +static DEVICE_ATTR_RO(target_node); + static ssize_t resource_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + unsigned long long start; - return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax)); + if (dev_dax->nr_range < 1) + start = dax_region->res.start; + else + start = dev_dax->ranges[0].range.start; + + return sprintf(buf, "%#llx\n", start); } static DEVICE_ATTR(resource, 0400, resource_show, NULL); @@ -333,18 +1243,26 @@ static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0) return 0; if (a == &dev_attr_numa_node.attr && !IS_ENABLED(CONFIG_NUMA)) return 0; + if (a == &dev_attr_mapping.attr && is_static(dax_region)) + return 0; + if ((a == &dev_attr_align.attr || + a == &dev_attr_size.attr) && is_static(dax_region)) + return 0444; return a->mode; } static struct attribute *dev_dax_attributes[] = { &dev_attr_modalias.attr, &dev_attr_size.attr, + &dev_attr_mapping.attr, &dev_attr_target_node.attr, + &dev_attr_align.attr, &dev_attr_resource.attr, &dev_attr_numa_node.attr, NULL, @@ -360,24 +1278,16 @@ static const struct attribute_group *dax_attribute_groups[] = { NULL, }; -void kill_dev_dax(struct dev_dax *dev_dax) -{ - struct dax_device *dax_dev = dev_dax->dax_dev; - struct inode *inode = dax_inode(dax_dev); - - kill_dax(dax_dev); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); -} -EXPORT_SYMBOL_GPL(kill_dev_dax); - static void dev_dax_release(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_region *dax_region = dev_dax->region; struct dax_device *dax_dev = dev_dax->dax_dev; - dax_region_put(dax_region); put_dax(dax_dev); + free_dev_dax_id(dev_dax); + dax_region_put(dax_region); + kfree(dev_dax->pgmap); kfree(dev_dax); } @@ -386,65 +1296,89 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; -static void unregister_dev_dax(void *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - - dev_dbg(dev, "%s\n", __func__); - - kill_dev_dax(dev_dax); - device_del(dev); - put_device(dev); -} - -struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap, enum dev_dax_subsys subsys) +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) { + struct dax_region *dax_region = data->dax_region; struct device *parent = dax_region->dev; struct dax_device *dax_dev; struct dev_dax *dev_dax; struct inode *inode; struct device *dev; - int rc = -ENOMEM; - - if (id < 0) - return ERR_PTR(-EINVAL); + int rc; dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL); if (!dev_dax) return ERR_PTR(-ENOMEM); - memcpy(&dev_dax->pgmap, pgmap, sizeof(*pgmap)); + if (is_static(dax_region)) { + if (dev_WARN_ONCE(parent, data->id < 0, + "dynamic id specified to static region\n")) { + rc = -EINVAL; + goto err_id; + } + + dev_dax->id = data->id; + } else { + if (dev_WARN_ONCE(parent, data->id >= 0, + "static id specified to dynamic region\n")) { + rc = -EINVAL; + goto err_id; + } + + rc = ida_alloc(&dax_region->ida, GFP_KERNEL); + if (rc < 0) + goto err_id; + dev_dax->id = rc; + } + + dev_dax->region = dax_region; + dev = &dev_dax->dev; + device_initialize(dev); + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); + + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, data->size); + if (rc) + goto err_range; + + if (data->pgmap) { + dev_WARN_ONCE(parent, !is_static(dax_region), + "custom dev_pagemap requires a static dax_region\n"); + + dev_dax->pgmap = kmemdup(data->pgmap, + sizeof(struct dev_pagemap), GFP_KERNEL); + if (!dev_dax->pgmap) { + rc = -ENOMEM; + goto err_pgmap; + } + } /* - * No 'host' or dax_operations since there is no access to this - * device outside of mmap of the resulting character device. + * No dax_operations since there is no access to this device outside of + * mmap of the resulting character device. */ - dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); - if (!dax_dev) - goto err; + dax_dev = alloc_dax(dev_dax, NULL); + if (IS_ERR(dax_dev)) { + rc = PTR_ERR(dax_dev); + goto err_alloc_dax; + } + set_dax_synchronous(dax_dev); + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev); /* a device_dax instance is dead while the driver is not attached */ kill_dax(dax_dev); - /* from here on we're committed to teardown via dax_dev_release() */ - dev = &dev_dax->dev; - device_initialize(dev); - dev_dax->dax_dev = dax_dev; - dev_dax->region = dax_region; dev_dax->target_node = dax_region->target_node; + dev_dax->align = dax_region->align; + ida_init(&dev_dax->ida); kref_get(&dax_region->kref); inode = dax_inode(dax_dev); dev->devt = inode->i_rdev; - if (subsys == DEV_DAX_BUS) - dev->bus = &dax_bus_type; - else - dev->class = dax_class; + dev->bus = &dax_bus_type; dev->parent = parent; dev->type = &dev_dax_type; - dev_set_name(dev, "dax%d.%d", dax_region->id, id); rc = device_add(dev); if (rc) { @@ -457,14 +1391,27 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, if (rc) return ERR_PTR(rc); + /* register mapping device for the initial allocation range */ + if (dev_dax->nr_range && range_len(&dev_dax->ranges[0].range)) { + rc = devm_register_dax_mapping(dev_dax, 0); + if (rc) + return ERR_PTR(rc); + } + return dev_dax; - err: +err_alloc_dax: + kfree(dev_dax->pgmap); +err_pgmap: + free_dev_dax_ranges(dev_dax); +err_range: + free_dev_dax_id(dev_dax); +err_id: kfree(dev_dax); return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(__devm_create_dev_dax); +EXPORT_SYMBOL_GPL(devm_create_dev_dax); static int match_always_count; @@ -474,6 +1421,13 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, struct device_driver *drv = &dax_drv->drv; int rc = 0; + /* + * dax_bus_probe() calls dax_drv->probe() unconditionally. + * So better be safe than sorry and ensure it is provided. + */ + if (!dax_drv->probe) + return -EINVAL; + INIT_LIST_HEAD(&dax_drv->ids); drv->owner = module; drv->name = mod_name; @@ -491,7 +1445,15 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, mutex_unlock(&dax_bus_lock); if (rc) return rc; - return driver_register(drv); + + rc = driver_register(drv); + if (rc && dax_drv->match_always) { + mutex_lock(&dax_bus_lock); + match_always_count -= dax_drv->match_always; + mutex_unlock(&dax_bus_lock); + } + + return rc; } EXPORT_SYMBOL_GPL(__dax_driver_register); @@ -513,22 +1475,10 @@ EXPORT_SYMBOL_GPL(dax_driver_unregister); int __init dax_bus_init(void) { - int rc; - - if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) { - dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) - return PTR_ERR(dax_class); - } - - rc = bus_register(&dax_bus_type); - if (rc) - class_destroy(dax_class); - return rc; + return bus_register(&dax_bus_type); } void __exit dax_bus_exit(void) { bus_unregister(&dax_bus_type); - class_destroy(dax_class); } diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 9e4eba67e8b9..fbb940293d6d 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -3,37 +3,34 @@ #ifndef __DAX_BUS_H__ #define __DAX_BUS_H__ #include <linux/device.h> +#include <linux/range.h> struct dev_dax; struct resource; struct dax_device; struct dax_region; void dax_region_put(struct dax_region *dax_region); -struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, - unsigned long long flags); -enum dev_dax_subsys { - DEV_DAX_BUS, - DEV_DAX_CLASS, +#define IORESOURCE_DAX_STATIC (1UL << 0) +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct range *range, int target_node, unsigned int align, + unsigned long flags); + +struct dev_dax_data { + struct dax_region *dax_region; + struct dev_pagemap *pgmap; + resource_size_t size; + int id; }; -struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap, enum dev_dax_subsys subsys); - -static inline struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - int id, struct dev_pagemap *pgmap) -{ - return __devm_create_dev_dax(dax_region, id, pgmap, DEV_DAX_BUS); -} - -/* to be deleted when DEV_DAX_CLASS is removed */ -struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); struct dax_device_driver { struct device_driver drv; struct list_head ids; int match_always; + int (*probe)(struct dev_dax *dev); + void (*remove)(struct dev_dax *dev); }; int __dax_driver_register(struct dax_device_driver *dax_drv, @@ -42,10 +39,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); - -#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) -int dev_dax_probe(struct device *dev); -#endif +bool static_dev_dax(struct dev_dax *dev_dax); /* * While run_dax() is potentially a generic operation that could be diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 3107ce80e809..1c974b7caae6 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -7,6 +7,7 @@ #include <linux/device.h> #include <linux/cdev.h> +#include <linux/idr.h> /* private routines between core files */ struct dax_device; @@ -22,8 +23,10 @@ void dax_bus_exit(void); * @kref: to pin while other agents have a need to do lookups * @dev: parent device backing this region * @align: allocation and mapping alignment for child dax devices - * @res: physical address range of the region - * @pfn_flags: identify whether the pfns are paged back or not + * @ida: instance id allocator + * @res: resource tree to track instance allocations + * @seed: allow userspace to find the first unbound seed device + * @youngest: allow userspace to find the most recently created device */ struct dax_region { int id; @@ -31,8 +34,16 @@ struct dax_region { struct kref kref; struct device *dev; unsigned int align; + struct ida ida; struct resource res; - unsigned long long pfn_flags; + struct device *seed; + struct device *youngest; +}; + +struct dax_mapping { + struct device dev; + int range_id; + int id; }; /** @@ -41,21 +52,57 @@ struct dax_region { * @region - parent region * @dax_dev - core dax functionality * @target_node: effective numa node if dev_dax memory range is onlined + * @id: ida allocated id + * @ida: mapping id allocator * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) - * @dax_mem_res: physical address range of hotadded DAX memory + * @nr_range: size of @ranges + * @ranges: resource-span + pgoff tuples for the instance */ struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; + unsigned int align; int target_node; + int id; + struct ida ida; struct device dev; - struct dev_pagemap pgmap; - struct resource *dax_kmem_res; + struct dev_pagemap *pgmap; + int nr_range; + struct dev_dax_range { + unsigned long pgoff; + struct range range; + struct dax_mapping *mapping; + } *ranges; }; static inline struct dev_dax *to_dev_dax(struct device *dev) { return container_of(dev, struct dev_dax, dev); } + +static inline struct dax_mapping *to_dax_mapping(struct device *dev) +{ + return container_of(dev, struct dax_mapping, dev); +} + +phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline bool dax_align_valid(unsigned long align) +{ + if (align == PUD_SIZE && IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + return true; + if (align == PMD_SIZE && has_transparent_hugepage()) + return true; + if (align == PAGE_SIZE) + return true; + return false; +} +#else +static inline bool dax_align_valid(unsigned long align) +{ + return align == PAGE_SIZE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1af823b2fe6b..5494d745ced5 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -17,7 +17,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { - struct dax_region *dax_region = dev_dax->region; struct device *dev = &dev_dax->dev; unsigned long mask; @@ -32,7 +31,7 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return -EINVAL; } - mask = dax_region->align - 1; + mask = dev_dax->align - 1; if (vma->vm_start & mask || vma->vm_end & mask) { dev_info_ratelimited(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", @@ -41,14 +40,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return -EINVAL; } - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV - && (vma->vm_flags & VM_DONTCOPY) == 0) { - dev_info_ratelimited(dev, - "%s: %s: fail, dax range requires MADV_DONTFORK\n", - current->comm, func); - return -EINVAL; - } - if (!vma_is_dax(vma)) { dev_info_ratelimited(dev, "%s: %s: fail, vma is not DAX capable\n", @@ -63,37 +54,70 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { - struct resource *res = &dev_dax->region->res; - phys_addr_t phys; - - phys = pgoff * PAGE_SIZE + res->start; - if (phys >= res->start && phys <= res->end) { - if (phys + size - 1 <= res->end) + int i; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + unsigned long long pgoff_end; + phys_addr_t phys; + + pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; + if (pgoff < dax_range->pgoff || pgoff > pgoff_end) + continue; + phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; + if (phys + size - 1 <= range->end) return phys; + break; } - return -1; } +static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, + unsigned long fault_size) +{ + unsigned long i, nr_pages = fault_size / PAGE_SIZE; + struct file *filp = vmf->vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + pgoff_t pgoff; + + /* mapping is only set on the head */ + if (dev_dax->pgmap->vmemmap_shift) + nr_pages = 1; + + pgoff = linear_page_index(vmf->vma, + ALIGN(vmf->address, fault_size)); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + + page = compound_head(page); + if (page->mapping) + continue; + + page->mapping = filp->f_mapping; + page->index = pgoff + i; + } +} + static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; + pfn_t pfn; unsigned int fault_size = PAGE_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PAGE_SIZE) { + if (dev_dax->align > PAGE_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - if (fault_size != dax_region->align) + if (fault_size != dev_dax->align) return VM_FAULT_SIGBUS; phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); @@ -102,40 +126,35 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); } static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PMD_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PMD_SIZE) { + if (dev_dax->align > PMD_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - /* dax pmd mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "region lacks devmap flags\n"); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - if (fault_size < dax_region->align) + if (fault_size < dev_dax->align) return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) + else if (fault_size > dev_dax->align) return VM_FAULT_FALLBACK; /* if we are outside of the VMA */ @@ -150,42 +169,37 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pud_addr = vmf->address & PUD_MASK; struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PUD_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PUD_SIZE) { + if (dev_dax->align > PUD_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - /* dax pud mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "region lacks devmap flags\n"); + if (fault_size < dev_dax->align) return VM_FAULT_SIGBUS; - } - - if (fault_size < dax_region->align) - return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) + else if (fault_size > dev_dax->align) return VM_FAULT_FALLBACK; /* if we are outside of the VMA */ @@ -200,13 +214,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #else static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { return VM_FAULT_FALLBACK; } @@ -216,10 +232,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { struct file *filp = vmf->vma->vm_file; - unsigned long fault_size; vm_fault_t rc = VM_FAULT_SIGBUS; int id; - pfn_t pfn; struct dev_dax *dev_dax = filp->private_data; dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, @@ -229,43 +243,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - fault_size = PAGE_SIZE; - rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pte_fault(dev_dax, vmf); break; case PE_SIZE_PMD: - fault_size = PMD_SIZE; - rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pmd_fault(dev_dax, vmf); break; case PE_SIZE_PUD: - fault_size = PUD_SIZE; - rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pud_fault(dev_dax, vmf); break; default: rc = VM_FAULT_SIGBUS; } - if (rc == VM_FAULT_NOPAGE) { - unsigned long i; - pgoff_t pgoff; - - /* - * In the device-dax case the only possibility for a - * VM_FAULT_NOPAGE result is when device-dax capacity is - * mapped. No need to consider the zero page, or racing - * conflicting mappings. - */ - pgoff = linear_page_index(vmf->vma, vmf->address - & ~(fault_size - 1)); - for (i = 0; i < fault_size / PAGE_SIZE; i++) { - struct page *page; - - page = pfn_to_page(pfn_t_to_pfn(pfn) + i); - if (page->mapping) - continue; - page->mapping = filp->f_mapping; - page->index = pgoff + i; - } - } dax_read_unlock(id); return rc; @@ -276,13 +265,12 @@ static vm_fault_t dev_dax_fault(struct vm_fault *vmf) return dev_dax_huge_fault(vmf, PE_SIZE_PTE); } -static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) +static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; - struct dax_region *dax_region = dev_dax->region; - if (!IS_ALIGNED(addr, dax_region->align)) + if (!IS_ALIGNED(addr, dev_dax->align)) return -EINVAL; return 0; } @@ -291,15 +279,14 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; - struct dax_region *dax_region = dev_dax->region; - return dax_region->align; + return dev_dax->align; } static const struct vm_operations_struct dax_vm_ops = { .fault = dev_dax_fault, .huge_fault = dev_dax_huge_fault, - .split = dev_dax_split, + .may_split = dev_dax_may_split, .pagesize = dev_dax_pagesize, }; @@ -332,13 +319,11 @@ static unsigned long dax_get_unmapped_area(struct file *filp, { unsigned long off, off_end, off_align, len_align, addr_align, align; struct dev_dax *dev_dax = filp ? filp->private_data : NULL; - struct dax_region *dax_region; if (!dev_dax || addr) goto out; - dax_region = dev_dax->region; - align = dax_region->align; + align = dev_dax->align; off = pgoff << PAGE_SHIFT; off_end = off + len; off_align = round_up(off, align); @@ -361,8 +346,7 @@ static unsigned long dax_get_unmapped_area(struct file *filp, } static const struct address_space_operations dev_dax_aops = { - .set_page_dirty = noop_set_page_dirty, - .invalidatepage = noop_invalidatepage, + .dirty_folio = noop_dirty_folio, }; static int dax_open(struct inode *inode, struct file *filp) @@ -377,6 +361,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); filp->private_data = dev_dax; inode->i_flags = S_DAX; @@ -411,36 +396,69 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } -int dev_dax_probe(struct device *dev) +int dev_dax_probe(struct dev_dax *dev_dax) { - struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_device *dax_dev = dev_dax->dax_dev; - struct resource *res = &dev_dax->region->res; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; struct inode *inode; struct cdev *cdev; void *addr; - int rc; + int rc, i; + + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, + "static pgmap / multi-range device conflict\n"); + return -EINVAL; + } - /* 1:1 map region resource range to device-dax instance range */ - if (!devm_request_mem_region(dev, res->start, resource_size(res), - dev_name(dev))) { - dev_warn(dev, "could not reserve region %pR\n", res); - return -EBUSY; + pgmap = dev_dax->pgmap; + } else { + if (dev_dax->pgmap) { + dev_warn(dev, + "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } + + pgmap = devm_kzalloc(dev, + struct_size(pgmap, ranges, dev_dax->nr_range - 1), + GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + pgmap->ranges[i] = *range; + } } - dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX; - addr = devm_memremap_pages(dev, &dev_dax->pgmap); + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + } + + pgmap->type = MEMORY_DEVICE_GENERIC; + if (dev_dax->align > PAGE_SIZE) + pgmap->vmemmap_shift = + order_base_2(dev_dax->align >> PAGE_SHIFT); + addr = devm_memremap_pages(dev, pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); inode = dax_inode(dax_dev); cdev = inode->i_cdev; cdev_init(cdev, &dax_fops); - if (dev->class) { - /* for the CONFIG_DEV_DAX_PMEM_COMPAT case */ - cdev->owner = dev->parent->driver->owner; - } else - cdev->owner = dev->driver->owner; + cdev->owner = dev->driver->owner; cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) @@ -455,17 +473,9 @@ int dev_dax_probe(struct device *dev) } EXPORT_SYMBOL_GPL(dev_dax_probe); -static int dev_dax_remove(struct device *dev) -{ - /* all probe actions are unwound by devm */ - return 0; -} - static struct dax_device_driver device_dax_driver = { - .drv = { - .probe = dev_dax_probe, - .remove = dev_dax_remove, - }, + .probe = dev_dax_probe, + /* all probe actions are unwound by devm, so .remove isn't necessary */ .match_always = 1, }; diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile new file mode 100644 index 000000000000..57377b4c3d47 --- /dev/null +++ b/drivers/dax/hmem/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o +obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device_hmem.o + +device_hmem-y := device.o +dax_hmem-y := hmem.o diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c new file mode 100644 index 000000000000..97086fab698e --- /dev/null +++ b/drivers/dax/hmem/device.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/platform_device.h> +#include <linux/memregion.h> +#include <linux/module.h> +#include <linux/dax.h> +#include <linux/mm.h> + +static bool nohmem; +module_param_named(disable, nohmem, bool, 0444); + +void hmem_register_device(int target_nid, struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + .desc = IORES_DESC_SOFT_RESERVED, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + if (nohmem) + return; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = numa_map_to_online_node(target_nid); + info = (struct memregion_info) { + .target_node = target_nid, + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "device add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + platform_device_put(pdev); +out_pdev: + memregion_free(id); +} + +static __init int hmem_register_one(struct resource *res, void *data) +{ + /* + * If the resource is not a top-level resource it was already + * assigned to a device by the HMAT parsing. + */ + if (res->parent != &iomem_resource) { + pr_info("HMEM: skip %pr, already claimed\n", res); + return 0; + } + + hmem_register_device(phys_to_target_node(res->start), res); + + return 0; +} + +static __init int hmem_init(void) +{ + walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, + IORESOURCE_MEM, 0, -1, NULL, hmem_register_one); + return 0; +} + +/* + * As this is a fallback for address ranges unclaimed by the ACPI HMAT + * parsing it must be at an initcall level greater than hmat_init(). + */ +late_initcall(hmem_init); diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem/hmem.c index fe7214daf62e..1bf040dbc834 100644 --- a/drivers/dax/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -3,30 +3,39 @@ #include <linux/memregion.h> #include <linux/module.h> #include <linux/pfn_t.h> -#include "bus.h" +#include "../bus.h" + +static bool region_idle; +module_param_named(region_idle, region_idle, bool, 0644); static int dax_hmem_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct dev_pagemap pgmap = { }; struct dax_region *dax_region; struct memregion_info *mri; + struct dev_dax_data data; struct dev_dax *dev_dax; struct resource *res; + struct range range; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENOMEM; mri = dev->platform_data; - memcpy(&pgmap.res, res, sizeof(*res)); - - dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, - PMD_SIZE, PFN_DEV|PFN_MAP); + range.start = res->start; + range.end = res->end; + dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node, + PMD_SIZE, 0); if (!dax_region) return -ENOMEM; - dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap); + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = -1, + .size = region_idle ? 0 : resource_size(res), + }; + dev_dax = devm_create_dev_dax(&data); if (IS_ERR(dev_dax)) return PTR_ERR(dev_dax); diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 3d0a7e702c94..4852a2dbdb27 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -11,19 +11,52 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/memory-tiers.h> #include "dax-private.h" #include "bus.h" -int dev_dax_kmem_probe(struct device *dev) +/* + * Default abstract distance assigned to the NUMA node onlined + * by DAX/kmem if the low level platform driver didn't initialize + * one for this NUMA node. + */ +#define MEMTIER_DEFAULT_DAX_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5) + +/* Memory resource name used for add_memory_driver_managed(). */ +static const char *kmem_name; +/* Set if any memory will remain added when the driver will be unloaded. */ +static bool any_hotremove_failed; + +static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) { - struct dev_dax *dev_dax = to_dev_dax(dev); - struct resource *res = &dev_dax->region->res; - resource_size_t kmem_start; - resource_size_t kmem_size; - resource_size_t kmem_end; - struct resource *new_res; + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + + /* memory-block align the hotplug range */ + r->start = ALIGN(range->start, memory_block_size_bytes()); + r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1; + if (r->start >= r->end) { + r->start = range->start; + r->end = range->end; + return -ENOSPC; + } + return 0; +} + +struct dax_kmem_data { + const char *res_name; + int mgid; + struct resource *res[]; +}; + +static struct memory_dev_type *dax_slowmem_type; +static int dev_dax_kmem_probe(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + unsigned long total_len = 0; + struct dax_kmem_data *data; + int i, rc, mapped = 0; int numa_node; - int rc; /* * Ensure good NUMA information for the persistent memory. @@ -33,57 +66,118 @@ int dev_dax_kmem_probe(struct device *dev) */ numa_node = dev_dax->target_node; if (numa_node < 0) { - dev_warn(dev, "rejecting DAX region %pR with invalid node: %d\n", - res, numa_node); + dev_warn(dev, "rejecting DAX region with invalid node: %d\n", + numa_node); return -EINVAL; } - /* Hotplug starting at the beginning of the next block: */ - kmem_start = ALIGN(res->start, memory_block_size_bytes()); - - kmem_size = resource_size(res); - /* Adjust the size down to compensate for moving up kmem_start: */ - kmem_size -= kmem_start - res->start; - /* Align the size down to cover only complete blocks: */ - kmem_size &= ~(memory_block_size_bytes() - 1); - kmem_end = kmem_start + kmem_size; - - /* Region is permanently reserved. Hot-remove not yet implemented. */ - new_res = request_mem_region(kmem_start, kmem_size, dev_name(dev)); - if (!new_res) { - dev_warn(dev, "could not reserve region [%pa-%pa]\n", - &kmem_start, &kmem_end); - return -EBUSY; + for (i = 0; i < dev_dax->nr_range; i++) { + struct range range; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) { + dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", + i, range.start, range.end); + continue; + } + total_len += range_len(&range); } - /* - * Set flags appropriate for System RAM. Leave ..._BUSY clear - * so that add_memory() can add a child resource. Do not - * inherit flags from the parent since it may set new flags - * unknown to us that will break add_memory() below. - */ - new_res->flags = IORESOURCE_SYSTEM_RAM; - new_res->name = dev_name(dev); - - rc = add_memory(numa_node, new_res->start, resource_size(new_res)); - if (rc) { - release_resource(new_res); - kfree(new_res); - return rc; + if (!total_len) { + dev_warn(dev, "rejecting DAX region without any memory after alignment\n"); + return -EINVAL; + } + + init_node_memory_type(numa_node, dax_slowmem_type); + + rc = -ENOMEM; + data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL); + if (!data) + goto err_dax_kmem_data; + + data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); + if (!data->res_name) + goto err_res_name; + + rc = memory_group_register_static(numa_node, total_len); + if (rc < 0) + goto err_reg_mgid; + data->mgid = rc; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct resource *res; + struct range range; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) + continue; + + /* Region is permanently reserved if hotremove fails. */ + res = request_mem_region(range.start, range_len(&range), data->res_name); + if (!res) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", + i, range.start, range.end); + /* + * Once some memory has been onlined we can't + * assume that it can be un-onlined safely. + */ + if (mapped) + continue; + rc = -EBUSY; + goto err_request_mem; + } + data->res[i] = res; + + /* + * Set flags appropriate for System RAM. Leave ..._BUSY clear + * so that add_memory() can add a child resource. Do not + * inherit flags from the parent since it may set new flags + * unknown to us that will break add_memory() below. + */ + res->flags = IORESOURCE_SYSTEM_RAM; + + /* + * Ensure that future kexec'd kernels will not treat + * this as RAM automatically. + */ + rc = add_memory_driver_managed(data->mgid, range.start, + range_len(&range), kmem_name, MHP_NID_IS_MGID); + + if (rc) { + dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", + i, range.start, range.end); + release_resource(res); + kfree(res); + data->res[i] = NULL; + if (mapped) + continue; + goto err_request_mem; + } + mapped++; } - dev_dax->dax_kmem_res = new_res; + + dev_set_drvdata(dev, data); return 0; + +err_request_mem: + memory_group_unregister(data->mgid); +err_reg_mgid: + kfree(data->res_name); +err_res_name: + kfree(data); +err_dax_kmem_data: + clear_node_memory_type(numa_node, dax_slowmem_type); + return rc; } #ifdef CONFIG_MEMORY_HOTREMOVE -static int dev_dax_kmem_remove(struct device *dev) +static void dev_dax_kmem_remove(struct dev_dax *dev_dax) { - struct dev_dax *dev_dax = to_dev_dax(dev); - struct resource *res = dev_dax->dax_kmem_res; - resource_size_t kmem_start = res->start; - resource_size_t kmem_size = resource_size(res); - int rc; + int i, success = 0; + int node = dev_dax->target_node; + struct device *dev = &dev_dax->dev; + struct dax_kmem_data *data = dev_get_drvdata(dev); /* * We have one shot for removing memory, if some memory blocks were not @@ -91,23 +185,45 @@ static int dev_dax_kmem_remove(struct device *dev) * there is no way to hotremove this memory until reboot because device * unbind will succeed even if we return failure. */ - rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size); - if (rc) { + for (i = 0; i < dev_dax->nr_range; i++) { + struct range range; + int rc; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) + continue; + + rc = remove_memory(range.start, range_len(&range)); + if (rc == 0) { + release_resource(data->res[i]); + kfree(data->res[i]); + data->res[i] = NULL; + success++; + continue; + } + any_hotremove_failed = true; dev_err(dev, - "DAX region %pR cannot be hotremoved until the next reboot\n", - res); - return rc; + "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n", + i, range.start, range.end); } - /* Release and free dax resources */ - release_resource(res); - kfree(res); - dev_dax->dax_kmem_res = NULL; - - return 0; + if (success >= dev_dax->nr_range) { + memory_group_unregister(data->mgid); + kfree(data->res_name); + kfree(data); + dev_set_drvdata(dev, NULL); + /* + * Clear the memtype association on successful unplug. + * If not, we have memory blocks left which can be + * offlined/onlined later. We need to keep memory_dev_type + * for that. This implies this reference will be around + * till next reboot. + */ + clear_node_memory_type(node, dax_slowmem_type); + } } #else -static int dev_dax_kmem_remove(struct device *dev) +static void dev_dax_kmem_remove(struct dev_dax *dev_dax) { /* * Without hotremove purposely leak the request_mem_region() for the @@ -116,25 +232,49 @@ static int dev_dax_kmem_remove(struct device *dev) * permanently pinned as reserved by the unreleased * request_mem_region(). */ - return 0; + any_hotremove_failed = true; } #endif /* CONFIG_MEMORY_HOTREMOVE */ static struct dax_device_driver device_dax_kmem_driver = { - .drv = { - .probe = dev_dax_kmem_probe, - .remove = dev_dax_kmem_remove, - }, + .probe = dev_dax_kmem_probe, + .remove = dev_dax_kmem_remove, }; static int __init dax_kmem_init(void) { - return dax_driver_register(&device_dax_kmem_driver); + int rc; + + /* Resource name is permanently allocated if any hotremove fails. */ + kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); + if (!kmem_name) + return -ENOMEM; + + dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE); + if (IS_ERR(dax_slowmem_type)) { + rc = PTR_ERR(dax_slowmem_type); + goto err_dax_slowmem_type; + } + + rc = dax_driver_register(&device_dax_kmem_driver); + if (rc) + goto error_dax_driver; + + return rc; + +error_dax_driver: + destroy_memory_type(dax_slowmem_type); +err_dax_slowmem_type: + kfree_const(kmem_name); + return rc; } static void __exit dax_kmem_exit(void) { dax_driver_unregister(&device_dax_kmem_driver); + if (!any_hotremove_failed) + kfree_const(kmem_name); + destroy_memory_type(dax_slowmem_type); } MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem.c index 2bedf8414fff..f050ea78bb83 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem.c @@ -3,17 +3,18 @@ #include <linux/memremap.h> #include <linux/module.h> #include <linux/pfn_t.h> -#include "../../nvdimm/pfn.h" -#include "../../nvdimm/nd.h" -#include "../bus.h" +#include "../nvdimm/pfn.h" +#include "../nvdimm/nd.h" +#include "bus.h" -struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) +static struct dev_dax *__dax_pmem_probe(struct device *dev) { - struct resource res; + struct range range; int rc, id, region_id; resource_size_t offset; struct nd_pfn_sb *pfn_sb; struct dev_dax *dev_dax; + struct dev_dax_data data; struct nd_namespace_io *nsio; struct dax_region *dax_region; struct dev_pagemap pgmap = { }; @@ -49,23 +50,54 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) if (rc != 2) return ERR_PTR(-EINVAL); - /* adjust the dax_region resource to the start of data */ - memcpy(&res, &pgmap.res, sizeof(res)); - res.start += offset; - dax_region = alloc_dax_region(dev, region_id, &res, + /* adjust the dax_region range to the start of data */ + range = pgmap.range; + range.start += offset; + dax_region = alloc_dax_region(dev, region_id, &range, nd_region->target_node, le32_to_cpu(pfn_sb->align), - PFN_DEV|PFN_MAP); + IORESOURCE_DAX_STATIC); if (!dax_region) return ERR_PTR(-ENOMEM); - dev_dax = __devm_create_dev_dax(dax_region, id, &pgmap, subsys); + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = id, + .pgmap = &pgmap, + .size = range_len(&range), + }; + dev_dax = devm_create_dev_dax(&data); /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); return dev_dax; } -EXPORT_SYMBOL_GPL(__dax_pmem_probe); + +static int dax_pmem_probe(struct device *dev) +{ + return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev)); +} + +static struct nd_device_driver dax_pmem_driver = { + .probe = dax_pmem_probe, + .drv = { + .name = "dax_pmem", + }, + .type = ND_DRIVER_DAX_PMEM, +}; + +static int __init dax_pmem_init(void) +{ + return nd_driver_register(&dax_pmem_driver); +} +module_init(dax_pmem_init); + +static void __exit dax_pmem_exit(void) +{ + driver_unregister(&dax_pmem_driver.drv); +} +module_exit(dax_pmem_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); diff --git a/drivers/dax/pmem/Makefile b/drivers/dax/pmem/Makefile index 010269f61d41..191c31f0d4f0 100644 --- a/drivers/dax/pmem/Makefile +++ b/drivers/dax/pmem/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o -obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o dax_pmem-y := pmem.o dax_pmem_core-y := core.o diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c deleted file mode 100644 index d7b15e6f30c5..000000000000 --- a/drivers/dax/pmem/compat.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */ -#include <linux/percpu-refcount.h> -#include <linux/memremap.h> -#include <linux/module.h> -#include <linux/pfn_t.h> -#include <linux/nd.h> -#include "../bus.h" - -/* we need the private definitions to implement compat suport */ -#include "../dax-private.h" - -static int dax_pmem_compat_probe(struct device *dev) -{ - struct dev_dax *dev_dax = __dax_pmem_probe(dev, DEV_DAX_CLASS); - int rc; - - if (IS_ERR(dev_dax)) - return PTR_ERR(dev_dax); - - if (!devres_open_group(&dev_dax->dev, dev_dax, GFP_KERNEL)) - return -ENOMEM; - - device_lock(&dev_dax->dev); - rc = dev_dax_probe(&dev_dax->dev); - device_unlock(&dev_dax->dev); - - devres_close_group(&dev_dax->dev, dev_dax); - if (rc) - devres_release_group(&dev_dax->dev, dev_dax); - - return rc; -} - -static int dax_pmem_compat_release(struct device *dev, void *data) -{ - device_lock(dev); - devres_release_group(dev, to_dev_dax(dev)); - device_unlock(dev); - - return 0; -} - -static int dax_pmem_compat_remove(struct device *dev) -{ - device_for_each_child(dev, NULL, dax_pmem_compat_release); - return 0; -} - -static struct nd_device_driver dax_pmem_compat_driver = { - .probe = dax_pmem_compat_probe, - .remove = dax_pmem_compat_remove, - .drv = { - .name = "dax_pmem_compat", - }, - .type = ND_DRIVER_DAX_PMEM, -}; - -static int __init dax_pmem_compat_init(void) -{ - return nd_driver_register(&dax_pmem_compat_driver); -} -module_init(dax_pmem_compat_init); - -static void __exit dax_pmem_compat_exit(void) -{ - driver_unregister(&dax_pmem_compat_driver.drv); -} -module_exit(dax_pmem_compat_exit); - -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Intel Corporation"); -MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); diff --git a/drivers/dax/pmem/pmem.c b/drivers/dax/pmem/pmem.c index 0ae4238a0ef8..dfe91a2990fe 100644 --- a/drivers/dax/pmem/pmem.c +++ b/drivers/dax/pmem/pmem.c @@ -7,34 +7,4 @@ #include <linux/nd.h> #include "../bus.h" -static int dax_pmem_probe(struct device *dev) -{ - return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev, DEV_DAX_BUS)); -} -static struct nd_device_driver dax_pmem_driver = { - .probe = dax_pmem_probe, - .drv = { - .name = "dax_pmem", - }, - .type = ND_DRIVER_DAX_PMEM, -}; - -static int __init dax_pmem_init(void) -{ - return nd_driver_register(&dax_pmem_driver); -} -module_init(dax_pmem_init); - -static void __exit dax_pmem_exit(void) -{ - driver_unregister(&dax_pmem_driver.drv); -} -module_exit(dax_pmem_exit); - -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Intel Corporation"); -#if !IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) -/* For compat builds, don't load this module by default */ -MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); -#endif diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0aa4b6bc5101..da4438f3188c 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -7,16 +7,34 @@ #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/magic.h> -#include <linux/genhd.h> #include <linux/pfn_t.h> #include <linux/cdev.h> -#include <linux/hash.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/dax.h> #include <linux/fs.h> #include "dax-private.h" +/** + * struct dax_device - anchor object for dax services + * @inode: core vfs + * @cdev: optional character interface for "device dax" + * @private: dax driver private data + * @flags: state and boolean properties + * @ops: operations for this device + * @holder_data: holder of a dax_device: could be filesystem or mapped device + * @holder_ops: operations for the inner holder + */ +struct dax_device { + struct inode inode; + struct cdev cdev; + void *private; + unsigned long flags; + const struct dax_operations *ops; + void *holder_data; + const struct dax_holder_operations *holder_ops; +}; + static dev_t dax_devt; DEFINE_STATIC_SRCU(dax_srcu); static struct vfsmount *dax_mnt; @@ -24,10 +42,6 @@ static DEFINE_IDA(dax_minor_ida); static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; -#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) -static struct hlist_head dax_host_list[DAX_HASH_SIZE]; -static DEFINE_SPINLOCK(dax_host_lock); - int dax_read_lock(void) { return srcu_read_lock(&dax_srcu); @@ -40,156 +54,72 @@ void dax_read_unlock(int id) } EXPORT_SYMBOL_GPL(dax_read_unlock); -#ifdef CONFIG_BLOCK +#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) #include <linux/blkdev.h> -int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, - pgoff_t *pgoff) -{ - phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; - - if (pgoff) - *pgoff = PHYS_PFN(phys_off); - if (phys_off % PAGE_SIZE || size % PAGE_SIZE) - return -EINVAL; - return 0; -} -EXPORT_SYMBOL(bdev_dax_pgoff); +static DEFINE_XARRAY(dax_hosts); -#if IS_ENABLED(CONFIG_FS_DAX) -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) +int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { - if (!blk_queue_dax(bdev->bd_queue)) - return NULL; - return dax_get_by_host(bdev->bd_disk->disk_name); + return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); -#endif - -bool __generic_fsdax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors) -{ - bool dax_enabled = false; - pgoff_t pgoff, pgoff_end; - char buf[BDEVNAME_SIZE]; - void *kaddr, *end_kaddr; - pfn_t pfn, end_pfn; - sector_t last_page; - long len, len2; - int err, id; - - if (blocksize != PAGE_SIZE) { - pr_debug("%s: error: unsupported blocksize for dax\n", - bdevname(bdev, buf)); - return false; - } - - err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff); - if (err) { - pr_debug("%s: error: unaligned partition for dax\n", - bdevname(bdev, buf)); - return false; - } - - last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512; - err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end); - if (err) { - pr_debug("%s: error: unaligned partition for dax\n", - bdevname(bdev, buf)); - return false; - } +EXPORT_SYMBOL_GPL(dax_add_host); - id = dax_read_lock(); - len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); - len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn); - dax_read_unlock(id); - - if (len < 1 || len2 < 1) { - pr_debug("%s: error: dax access failed (%ld)\n", - bdevname(bdev, buf), len < 1 ? len : len2); - return false; - } - - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { - /* - * An arch that has enabled the pmem api should also - * have its drivers support pfn_t_devmap() - * - * This is a developer warning and should not trigger in - * production. dax_flush() will crash since it depends - * on being able to do (page_address(pfn_to_page())). - */ - WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); - dax_enabled = true; - } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) { - struct dev_pagemap *pgmap, *end_pgmap; - - pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); - end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL); - if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX - && pfn_t_to_page(pfn)->pgmap == pgmap - && pfn_t_to_page(end_pfn)->pgmap == pgmap - && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr)) - && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr))) - dax_enabled = true; - put_dev_pagemap(pgmap); - put_dev_pagemap(end_pgmap); - - } - - if (!dax_enabled) { - pr_debug("%s: error: dax support not enabled\n", - bdevname(bdev, buf)); - return false; - } - return true; +void dax_remove_host(struct gendisk *disk) +{ + xa_erase(&dax_hosts, (unsigned long)disk); } -EXPORT_SYMBOL_GPL(__generic_fsdax_supported); +EXPORT_SYMBOL_GPL(dax_remove_host); /** - * __bdev_dax_supported() - Check if the device supports dax for filesystem - * @bdev: block device to check - * @blocksize: The block size of the device - * - * This is a library function for filesystems to check if the block device - * can be mounted with dax option. - * - * Return: true if supported, false if unsupported + * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax + * @bdev: block device to find a dax_device for + * @start_off: returns the byte offset into the dax_device that @bdev starts + * @holder: filesystem or mapped device inside the dax_device + * @ops: operations for the inner holder */ -bool __bdev_dax_supported(struct block_device *bdev, int blocksize) +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops) { struct dax_device *dax_dev; - struct request_queue *q; - char buf[BDEVNAME_SIZE]; - bool ret; + u64 part_size; int id; - q = bdev_get_queue(bdev); - if (!q || !blk_queue_dax(q)) { - pr_debug("%s: error: request queue doesn't support dax\n", - bdevname(bdev, buf)); - return false; - } + if (!blk_queue_dax(bdev->bd_disk->queue)) + return NULL; - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) { - pr_debug("%s: error: device does not support dax\n", - bdevname(bdev, buf)); - return false; + *start_off = get_start_sect(bdev) * SECTOR_SIZE; + part_size = bdev_nr_sectors(bdev) * SECTOR_SIZE; + if (*start_off % PAGE_SIZE || part_size % PAGE_SIZE) { + pr_info("%pg: error: unaligned partition for dax\n", bdev); + return NULL; } id = dax_read_lock(); - ret = dax_supported(dax_dev, bdev, blocksize, 0, - i_size_read(bdev->bd_inode) / 512); + dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); + if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) + dax_dev = NULL; + else if (holder) { + if (!cmpxchg(&dax_dev->holder_data, NULL, holder)) + dax_dev->holder_ops = ops; + else + dax_dev = NULL; + } dax_read_unlock(id); - put_dax(dax_dev); + return dax_dev; +} +EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); - return ret; +void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ + if (dax_dev && holder && + cmpxchg(&dax_dev->holder_data, holder, NULL) == holder) + dax_dev->holder_ops = NULL; + put_dax(dax_dev); } -EXPORT_SYMBOL_GPL(__bdev_dax_supported); -#endif +EXPORT_SYMBOL_GPL(fs_put_dax); +#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ enum dax_device_flags { /* !alive + rcu grace period == no new operations / mappings */ @@ -198,95 +128,18 @@ enum dax_device_flags { DAXDEV_WRITE_CACHE, /* flag to check if device supports synchronous flush */ DAXDEV_SYNC, + /* do not leave the caches dirty after writes */ + DAXDEV_NOCACHE, + /* handle CPU fetch exceptions during reads */ + DAXDEV_NOMC, }; /** - * struct dax_device - anchor object for dax services - * @inode: core vfs - * @cdev: optional character interface for "device dax" - * @host: optional name for lookups where the device path is not available - * @private: dax driver private data - * @flags: state and boolean properties - */ -struct dax_device { - struct hlist_node list; - struct inode inode; - struct cdev cdev; - const char *host; - void *private; - unsigned long flags; - const struct dax_operations *ops; -}; - -static ssize_t write_cache_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); - ssize_t rc; - - WARN_ON_ONCE(!dax_dev); - if (!dax_dev) - return -ENXIO; - - rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev)); - put_dax(dax_dev); - return rc; -} - -static ssize_t write_cache_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - bool write_cache; - int rc = strtobool(buf, &write_cache); - struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); - - WARN_ON_ONCE(!dax_dev); - if (!dax_dev) - return -ENXIO; - - if (rc) - len = rc; - else - dax_write_cache(dax_dev, write_cache); - - put_dax(dax_dev); - return len; -} -static DEVICE_ATTR_RW(write_cache); - -static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) -{ - struct device *dev = container_of(kobj, typeof(*dev), kobj); - struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); - - WARN_ON_ONCE(!dax_dev); - if (!dax_dev) - return 0; - -#ifndef CONFIG_ARCH_HAS_PMEM_API - if (a == &dev_attr_write_cache.attr) - return 0; -#endif - return a->mode; -} - -static struct attribute *dax_attributes[] = { - &dev_attr_write_cache.attr, - NULL, -}; - -struct attribute_group dax_attribute_group = { - .name = "dax", - .attrs = dax_attributes, - .is_visible = dax_visible, -}; -EXPORT_SYMBOL_GPL(dax_attribute_group); - -/** * dax_direct_access() - translate a device pgoff to an absolute pfn * @dax_dev: a dax_device instance representing the logical memory range * @pgoff: offset in pages from the start of the device to translate * @nr_pages: number of consecutive pages caller can handle relative to @pfn + * @mode: indicator on normal access or recovery write * @kaddr: output parameter that returns a virtual address mapping of pfn * @pfn: output parameter that returns an absolute pfn translation of @pgoff * @@ -294,7 +147,7 @@ EXPORT_SYMBOL_GPL(dax_attribute_group); * pages accessible at the device relative @pgoff. */ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, - void **kaddr, pfn_t *pfn) + enum dax_access_mode mode, void **kaddr, pfn_t *pfn) { long avail; @@ -305,44 +158,96 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, return -ENXIO; if (nr_pages < 0) - return nr_pages; + return -EINVAL; avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, - kaddr, pfn); + mode, kaddr, pfn); if (!avail) return -ERANGE; return min(avail, nr_pages); } EXPORT_SYMBOL_GPL(dax_direct_access); -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len) +size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) { if (!dax_alive(dax_dev)) - return false; + return 0; - return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len); + /* + * The userspace address for the memory copy has already been validated + * via access_ok() in vfs_write, so use the 'no check' version to bypass + * the HARDENED_USERCOPY overhead. + */ + if (test_bit(DAXDEV_NOCACHE, &dax_dev->flags)) + return _copy_from_iter_flushcache(addr, bytes, i); + return _copy_from_iter(addr, bytes, i); } -size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, +size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { if (!dax_alive(dax_dev)) return 0; - return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); + /* + * The userspace address for the memory copy has already been validated + * via access_ok() in vfs_red, so use the 'no check' version to bypass + * the HARDENED_USERCOPY overhead. + */ + if (test_bit(DAXDEV_NOMC, &dax_dev->flags)) + return _copy_mc_to_iter(addr, bytes, i); + return _copy_to_iter(addr, bytes, i); } -EXPORT_SYMBOL_GPL(dax_copy_from_iter); -size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, - size_t bytes, struct iov_iter *i) +int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) { if (!dax_alive(dax_dev)) + return -ENXIO; + /* + * There are no callers that want to zero more than one page as of now. + * Once users are there, this check can be removed after the + * device mapper code has been updated to split ranges across targets. + */ + if (nr_pages != 1) + return -EIO; + + return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages); +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter) +{ + if (!dax_dev->ops->recovery_write) return 0; + return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); +} +EXPORT_SYMBOL_GPL(dax_recovery_write); + +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, + u64 len, int mf_flags) +{ + int rc, id; + + id = dax_read_lock(); + if (!dax_alive(dax_dev)) { + rc = -ENXIO; + goto out; + } + + if (!dax_dev->holder_ops) { + rc = -EOPNOTSUPP; + goto out; + } - return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); +out: + dax_read_unlock(id); + return rc; } -EXPORT_SYMBOL_GPL(dax_copy_to_iter); +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); @@ -375,17 +280,29 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(dax_write_cache_enabled); -bool __dax_synchronous(struct dax_device *dax_dev) +bool dax_synchronous(struct dax_device *dax_dev) { return test_bit(DAXDEV_SYNC, &dax_dev->flags); } -EXPORT_SYMBOL_GPL(__dax_synchronous); +EXPORT_SYMBOL_GPL(dax_synchronous); -void __set_dax_synchronous(struct dax_device *dax_dev) +void set_dax_synchronous(struct dax_device *dax_dev) { set_bit(DAXDEV_SYNC, &dax_dev->flags); } -EXPORT_SYMBOL_GPL(__set_dax_synchronous); +EXPORT_SYMBOL_GPL(set_dax_synchronous); + +void set_dax_nocache(struct dax_device *dax_dev) +{ + set_bit(DAXDEV_NOCACHE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(set_dax_nocache); + +void set_dax_nomc(struct dax_device *dax_dev) +{ + set_bit(DAXDEV_NOMC, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(set_dax_nomc); bool dax_alive(struct dax_device *dax_dev) { @@ -394,11 +311,6 @@ bool dax_alive(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(dax_alive); -static int dax_host_hash(const char *host) -{ - return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; -} - /* * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring * that any fault handlers or operations that might have seen @@ -410,13 +322,15 @@ void kill_dax(struct dax_device *dax_dev) if (!dax_dev) return; - clear_bit(DAXDEV_ALIVE, &dax_dev->flags); + if (dax_dev->holder_data != NULL) + dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); - spin_lock(&dax_host_lock); - hlist_del_init(&dax_dev->list); - spin_unlock(&dax_host_lock); + /* clear holder data */ + dax_dev->holder_ops = NULL; + dax_dev->holder_data = NULL; } EXPORT_SYMBOL_GPL(kill_dax); @@ -431,7 +345,7 @@ static struct inode *dax_alloc_inode(struct super_block *sb) struct dax_device *dax_dev; struct inode *inode; - dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + dax_dev = alloc_inode_sb(sb, dax_cache, GFP_KERNEL); if (!dax_dev) return NULL; @@ -448,10 +362,8 @@ static struct dax_device *to_dax_dev(struct inode *inode) static void dax_free_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); - kfree(dax_dev->host); - dax_dev->host = NULL; if (inode->i_rdev) - ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); + ida_free(&dax_minor_ida, iminor(inode)); kmem_cache_free(dax_cache, dax_dev); } @@ -524,59 +436,31 @@ static struct dax_device *dax_dev_get(dev_t devt) return dax_dev; } -static void dax_add_host(struct dax_device *dax_dev, const char *host) -{ - int hash; - - /* - * Unconditionally init dax_dev since it's coming from a - * non-zeroed slab cache - */ - INIT_HLIST_NODE(&dax_dev->list); - dax_dev->host = host; - if (!host) - return; - - hash = dax_host_hash(host); - spin_lock(&dax_host_lock); - hlist_add_head(&dax_dev->list, &dax_host_list[hash]); - spin_unlock(&dax_host_lock); -} - -struct dax_device *alloc_dax(void *private, const char *__host, - const struct dax_operations *ops, unsigned long flags) +struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { struct dax_device *dax_dev; - const char *host; dev_t devt; int minor; - host = kstrdup(__host, GFP_KERNEL); - if (__host && !host) - return NULL; + if (WARN_ON_ONCE(ops && !ops->zero_page_range)) + return ERR_PTR(-EINVAL); - minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); + minor = ida_alloc_max(&dax_minor_ida, MINORMASK, GFP_KERNEL); if (minor < 0) - goto err_minor; + return ERR_PTR(-ENOMEM); devt = MKDEV(MAJOR(dax_devt), minor); dax_dev = dax_dev_get(devt); if (!dax_dev) goto err_dev; - dax_add_host(dax_dev, host); dax_dev->ops = ops; dax_dev->private = private; - if (flags & DAXDEV_F_SYNC) - set_dax_synchronous(dax_dev); - return dax_dev; err_dev: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - kfree(host); - return NULL; + ida_free(&dax_minor_ida, minor); + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(alloc_dax); @@ -589,36 +473,17 @@ void put_dax(struct dax_device *dax_dev) EXPORT_SYMBOL_GPL(put_dax); /** - * dax_get_by_host() - temporary lookup mechanism for filesystem-dax - * @host: alternate name for the device registered by a dax driver + * dax_holder() - obtain the holder of a dax device + * @dax_dev: a dax_device instance + + * Return: the holder's data which represents the holder if registered, + * otherwize NULL. */ -struct dax_device *dax_get_by_host(const char *host) +void *dax_holder(struct dax_device *dax_dev) { - struct dax_device *dax_dev, *found = NULL; - int hash, id; - - if (!host) - return NULL; - - hash = dax_host_hash(host); - - id = dax_read_lock(); - spin_lock(&dax_host_lock); - hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { - if (!dax_alive(dax_dev) - || strcmp(host, dax_dev->host) != 0) - continue; - - if (igrab(&dax_dev->inode)) - found = dax_dev; - break; - } - spin_unlock(&dax_host_lock); - dax_read_unlock(id); - - return found; + return dax_dev->holder_data; } -EXPORT_SYMBOL_GPL(dax_get_by_host); +EXPORT_SYMBOL_GPL(dax_holder); /** * inode_dax: convert a public inode into its dax_dev @@ -687,6 +552,7 @@ static int dax_fs_init(void) static void dax_fs_exit(void) { kern_unmount(dax_mnt); + rcu_barrier(); kmem_cache_destroy(dax_cache); } @@ -716,6 +582,7 @@ err_chrdev: static void __exit dax_core_exit(void) { + dax_bus_exit(); unregister_chrdev_region(dax_devt, MINORMASK+1); ida_destroy(&dax_minor_ida); dax_fs_exit(); |