From efebc711180f7fed701f6e23f23814fcfda7fbfc Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2017 15:33:36 -0700 Subject: device-dax, tools/testing/nvdimm: enable device-dax with mock resources Provide a replacement pgoff_to_phys() that translates an nfit_test resource (allocated by vmalloc()) to a pfn. Signed-off-by: Dave Jiang Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/dax/dax-private.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/dax/dax.c | 52 +++++----------------------------------- 2 files changed, 67 insertions(+), 46 deletions(-) create mode 100644 drivers/dax/dax-private.h (limited to 'drivers/dax') diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h new file mode 100644 index 000000000000..b1cd7a8e5ab9 --- /dev/null +++ b/drivers/dax/dax-private.h @@ -0,0 +1,61 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __DAX_PRIVATE_H__ +#define __DAX_PRIVATE_H__ + +#include +#include + +/** + * struct dax_region - mapping infrastructure for dax devices + * @id: kernel-wide unique region for a memory range + * @base: linear address corresponding to @res + * @kref: to pin while other agents have a need to do lookups + * @dev: parent device backing this region + * @align: allocation and mapping alignment for child dax devices + * @res: physical address range of the region + * @pfn_flags: identify whether the pfns are paged back or not + */ +struct dax_region { + int id; + struct ida ida; + void *base; + struct kref kref; + struct device *dev; + unsigned int align; + struct resource res; + unsigned long pfn_flags; +}; + +/** + * struct dax_dev - subdivision of a dax region + * @region - parent region + * @inode - inode + * @dev - device backing the character device + * @cdev - core chardev data + * @alive - !alive + srcu grace period == no new mappings can be established + * @id - child id in the region + * @num_resources - number of physical address extents in this device + * @res - array of physical address ranges + */ +struct dax_dev { + struct dax_region *region; + struct inode *inode; + struct device dev; + struct cdev cdev; + bool alive; + int id; + int num_resources; + struct resource res[0]; +}; +#endif diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 806f180c80d8..ef93aa84622b 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -22,6 +22,7 @@ #include #include #include +#include "dax-private.h" #include "dax.h" static dev_t dax_devt; @@ -35,48 +36,6 @@ static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); -/** - * struct dax_region - mapping infrastructure for dax devices - * @id: kernel-wide unique region for a memory range - * @base: linear address corresponding to @res - * @kref: to pin while other agents have a need to do lookups - * @dev: parent device backing this region - * @align: allocation and mapping alignment for child dax devices - * @res: physical address range of the region - * @pfn_flags: identify whether the pfns are paged back or not - */ -struct dax_region { - int id; - struct ida ida; - void *base; - struct kref kref; - struct device *dev; - unsigned int align; - struct resource res; - unsigned long pfn_flags; -}; - -/** - * struct dax_dev - subdivision of a dax region - * @region - parent region - * @dev - device backing the character device - * @cdev - core chardev data - * @alive - !alive + srcu grace period == no new mappings can be established - * @id - child id in the region - * @num_resources - number of physical address extents in this device - * @res - array of physical address ranges - */ -struct dax_dev { - struct dax_region *region; - struct inode *inode; - struct device dev; - struct cdev cdev; - bool alive; - int id; - int num_resources; - struct resource res[0]; -}; - static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -397,7 +356,8 @@ static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, return 0; } -static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, +/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ +__weak phys_addr_t dax_pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, unsigned long size) { struct resource *res; @@ -442,7 +402,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) if (fault_size != dax_region->align) return VM_FAULT_SIGBUS; - phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); + phys = dax_pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, vmf->pgoff); @@ -497,7 +457,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pgoff = linear_page_index(vmf->vma, pmd_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); + phys = dax_pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); @@ -548,7 +508,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pgoff = linear_page_index(vmf->vma, pud_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); + phys = dax_pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); -- cgit v1.2.3-59-g8ed1b From 54eafcc9e339affb8982fd21e1fc4aa4a036655b Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Tue, 11 Apr 2017 09:12:25 -0700 Subject: device-dax: fix dax_dev_huge_fault() unknown fault size handling The default case for dax_dev_huge_fault() fault size handling mistakenly returns when it should unlock. This is not a problem in practice since the only three possible fault sizes are handled. Going forward, if the core mm adds a new fault size beyond pte, pmd, or pud device-dax should abort VM_FAULT_SIGBUS requests not VM_FAULT_FALLBACK since device-dax guarantees a configured fault granularity for all faults. Signed-off-by: Pushkar Jambhlekar Signed-off-by: Dan Williams --- drivers/dax/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 19795eb35579..94036d92ed16 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -591,7 +591,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, rc = __dax_dev_pud_fault(dax_dev, vmf); break; default: - return VM_FAULT_FALLBACK; + rc = VM_FAULT_SIGBUS; } srcu_read_unlock(&dax_srcu, id); -- cgit v1.2.3-59-g8ed1b From 762026203c0b87b1088342b664e67ca7c45fb7c4 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 12 Apr 2017 01:59:36 +1000 Subject: device-dax: improve fault handler debug output A couple of minor improvements to the debug output in the fault handlers: a) Print the region alignment and fault size when we sent a SIGBUS because the region alignment is greater than the fault size. b) Fix the message in the PFN_{DEV|MAP} check. c) Additionally print the fault size enum value in the huge fault handler. Signed-off-by: Oliver O'Halloran Signed-off-by: Dan Williams --- drivers/dax/dax.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 94036d92ed16..352cc54056ce 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -435,7 +435,8 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) dax_region = dax_dev->region; if (dax_region->align > PAGE_SIZE) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); return VM_FAULT_SIGBUS; } @@ -476,13 +477,14 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) dax_region = dax_dev->region; if (dax_region->align > PMD_SIZE) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); return VM_FAULT_SIGBUS; } /* dax pmd mappings require pfn_t_devmap() */ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); return VM_FAULT_SIGBUS; } @@ -527,13 +529,14 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) dax_region = dax_dev->region; if (dax_region->align > PUD_SIZE) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); return VM_FAULT_SIGBUS; } /* dax pud mappings require pfn_t_devmap() */ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); return VM_FAULT_SIGBUS; } @@ -574,10 +577,10 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, struct file *filp = vmf->vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", - vmf->vma->vm_start, vmf->vma->vm_end); + vmf->vma->vm_start, vmf->vma->vm_end, pe_size); id = srcu_read_lock(&dax_srcu); switch (pe_size) { -- cgit v1.2.3-59-g8ed1b From 5f0694b300b9fb8409272c550418c22e0e57314a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 30 Jan 2017 21:43:10 -0800 Subject: device-dax: rename 'dax_dev' to 'dev_dax' In preparation for introducing a struct dax_device type to the kernel global type namespace, rename dax_dev to dev_dax. A 'dax_device' instance will be a generic device-driver object for any provider of dax functionality. A 'dev_dax' object is a device-dax-driver local / internal instance. Signed-off-by: Dan Williams --- drivers/dax/dax.c | 206 ++++++++++++++++++++++++++--------------------------- drivers/dax/dax.h | 4 +- drivers/dax/pmem.c | 8 +-- 3 files changed, 109 insertions(+), 109 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 352cc54056ce..376fdd353aea 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -57,7 +57,7 @@ struct dax_region { }; /** - * struct dax_dev - subdivision of a dax region + * struct dev_dax - instance data for a subdivision of a dax region * @region - parent region * @dev - device backing the character device * @cdev - core chardev data @@ -66,7 +66,7 @@ struct dax_region { * @num_resources - number of physical address extents in this device * @res - array of physical address ranges */ -struct dax_dev { +struct dev_dax { struct dax_region *region; struct inode *inode; struct device dev; @@ -323,47 +323,47 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); -static struct dax_dev *to_dax_dev(struct device *dev) +static struct dev_dax *to_dev_dax(struct device *dev) { - return container_of(dev, struct dax_dev, dev); + return container_of(dev, struct dev_dax, dev); } static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_dev *dax_dev = to_dax_dev(dev); + struct dev_dax *dev_dax = to_dev_dax(dev); unsigned long long size = 0; int i; - for (i = 0; i < dax_dev->num_resources; i++) - size += resource_size(&dax_dev->res[i]); + for (i = 0; i < dev_dax->num_resources; i++) + size += resource_size(&dev_dax->res[i]); return sprintf(buf, "%llu\n", size); } static DEVICE_ATTR_RO(size); -static struct attribute *dax_device_attributes[] = { +static struct attribute *dev_dax_attributes[] = { &dev_attr_size.attr, NULL, }; -static const struct attribute_group dax_device_attribute_group = { - .attrs = dax_device_attributes, +static const struct attribute_group dev_dax_attribute_group = { + .attrs = dev_dax_attributes, }; static const struct attribute_group *dax_attribute_groups[] = { - &dax_device_attribute_group, + &dev_dax_attribute_group, NULL, }; -static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, +static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { - struct dax_region *dax_region = dax_dev->region; - struct device *dev = &dax_dev->dev; + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; unsigned long mask; - if (!dax_dev->alive) + if (!dev_dax->alive) return -ENXIO; /* prevent private mappings from being established */ @@ -397,23 +397,23 @@ static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, return 0; } -static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, +static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { struct resource *res; phys_addr_t phys; int i; - for (i = 0; i < dax_dev->num_resources; i++) { - res = &dax_dev->res[i]; + for (i = 0; i < dev_dax->num_resources; i++) { + res = &dev_dax->res[i]; phys = pgoff * PAGE_SIZE + res->start; if (phys >= res->start && phys <= res->end) break; pgoff -= PHYS_PFN(resource_size(res)); } - if (i < dax_dev->num_resources) { - res = &dax_dev->res[i]; + if (i < dev_dax->num_resources) { + res = &dev_dax->res[i]; if (phys + size - 1 <= res->end) return phys; } @@ -421,19 +421,19 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { - struct device *dev = &dax_dev->dev; + struct device *dev = &dev_dax->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; phys_addr_t phys; pfn_t pfn; unsigned int fault_size = PAGE_SIZE; - if (check_vma(dax_dev, vmf->vma, __func__)) + if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dax_dev->region; + dax_region = dev_dax->region; if (dax_region->align > PAGE_SIZE) { dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", __func__, dax_region->align, fault_size); @@ -443,7 +443,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) if (fault_size != dax_region->align) return VM_FAULT_SIGBUS; - phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); + phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, vmf->pgoff); @@ -462,20 +462,20 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_NOPAGE; } -static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; - struct device *dev = &dax_dev->dev; + struct device *dev = &dev_dax->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; pfn_t pfn; unsigned int fault_size = PMD_SIZE; - if (check_vma(dax_dev, vmf->vma, __func__)) + if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dax_dev->region; + dax_region = dev_dax->region; if (dax_region->align > PMD_SIZE) { dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", __func__, dax_region->align, fault_size); @@ -499,7 +499,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pgoff = linear_page_index(vmf->vma, pmd_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); + phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); @@ -513,10 +513,10 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { unsigned long pud_addr = vmf->address & PUD_MASK; - struct device *dev = &dax_dev->dev; + struct device *dev = &dev_dax->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; @@ -524,10 +524,10 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) unsigned int fault_size = PUD_SIZE; - if (check_vma(dax_dev, vmf->vma, __func__)) + if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dax_dev->region; + dax_region = dev_dax->region; if (dax_region->align > PUD_SIZE) { dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", __func__, dax_region->align, fault_size); @@ -551,7 +551,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pgoff = linear_page_index(vmf->vma, pud_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); + phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); @@ -564,20 +564,20 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) vmf->flags & FAULT_FLAG_WRITE); } #else -static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { return VM_FAULT_FALLBACK; } #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static int dax_dev_huge_fault(struct vm_fault *vmf, +static int dev_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { int rc, id; struct file *filp = vmf->vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; + struct dev_dax *dev_dax = filp->private_data; - dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, + dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vmf->vma->vm_start, vmf->vma->vm_end, pe_size); @@ -585,13 +585,13 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, id = srcu_read_lock(&dax_srcu); switch (pe_size) { case PE_SIZE_PTE: - rc = __dax_dev_pte_fault(dax_dev, vmf); + rc = __dev_dax_pte_fault(dev_dax, vmf); break; case PE_SIZE_PMD: - rc = __dax_dev_pmd_fault(dax_dev, vmf); + rc = __dev_dax_pmd_fault(dev_dax, vmf); break; case PE_SIZE_PUD: - rc = __dax_dev_pud_fault(dax_dev, vmf); + rc = __dev_dax_pud_fault(dev_dax, vmf); break; default: rc = VM_FAULT_SIGBUS; @@ -601,28 +601,28 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, return rc; } -static int dax_dev_fault(struct vm_fault *vmf) +static int dev_dax_fault(struct vm_fault *vmf) { - return dax_dev_huge_fault(vmf, PE_SIZE_PTE); + return dev_dax_huge_fault(vmf, PE_SIZE_PTE); } -static const struct vm_operations_struct dax_dev_vm_ops = { - .fault = dax_dev_fault, - .huge_fault = dax_dev_huge_fault, +static const struct vm_operations_struct dax_vm_ops = { + .fault = dev_dax_fault, + .huge_fault = dev_dax_huge_fault, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) { - struct dax_dev *dax_dev = filp->private_data; + struct dev_dax *dev_dax = filp->private_data; int rc; - dev_dbg(&dax_dev->dev, "%s\n", __func__); + dev_dbg(&dev_dax->dev, "%s\n", __func__); - rc = check_vma(dax_dev, vma, __func__); + rc = check_vma(dev_dax, vma, __func__); if (rc) return rc; - vma->vm_ops = &dax_dev_vm_ops; + vma->vm_ops = &dax_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; } @@ -633,13 +633,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp, unsigned long flags) { unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dax_dev *dax_dev = filp ? filp->private_data : NULL; + struct dev_dax *dev_dax = filp ? filp->private_data : NULL; struct dax_region *dax_region; - if (!dax_dev || addr) + if (!dev_dax || addr) goto out; - dax_region = dax_dev->region; + dax_region = dev_dax->region; align = dax_region->align; off = pgoff << PAGE_SHIFT; off_end = off + len; @@ -664,14 +664,14 @@ static unsigned long dax_get_unmapped_area(struct file *filp, static int dax_open(struct inode *inode, struct file *filp) { - struct dax_dev *dax_dev; + struct dev_dax *dev_dax; - dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); - dev_dbg(&dax_dev->dev, "%s\n", __func__); - inode->i_mapping = dax_dev->inode->i_mapping; - inode->i_mapping->host = dax_dev->inode; + dev_dax = container_of(inode->i_cdev, struct dev_dax, cdev); + dev_dbg(&dev_dax->dev, "%s\n", __func__); + inode->i_mapping = dev_dax->inode->i_mapping; + inode->i_mapping->host = dev_dax->inode; filp->f_mapping = inode->i_mapping; - filp->private_data = dax_dev; + filp->private_data = dev_dax; inode->i_flags = S_DAX; return 0; @@ -679,9 +679,9 @@ static int dax_open(struct inode *inode, struct file *filp) static int dax_release(struct inode *inode, struct file *filp) { - struct dax_dev *dax_dev = filp->private_data; + struct dev_dax *dev_dax = filp->private_data; - dev_dbg(&dax_dev->dev, "%s\n", __func__); + dev_dbg(&dev_dax->dev, "%s\n", __func__); return 0; } @@ -694,55 +694,55 @@ static const struct file_operations dax_fops = { .mmap = dax_mmap, }; -static void dax_dev_release(struct device *dev) +static void dev_dax_release(struct device *dev) { - struct dax_dev *dax_dev = to_dax_dev(dev); - struct dax_region *dax_region = dax_dev->region; + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; - ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_region->ida, dev_dax->id); ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); dax_region_put(dax_region); - iput(dax_dev->inode); - kfree(dax_dev); + iput(dev_dax->inode); + kfree(dev_dax); } -static void kill_dax_dev(struct dax_dev *dax_dev) +static void kill_dev_dax(struct dev_dax *dev_dax) { /* - * Note, rcu is not protecting the liveness of dax_dev, rcu is + * Note, rcu is not protecting the liveness of dev_dax, rcu is * ensuring that any fault handlers that might have seen - * dax_dev->alive == true, have completed. Any fault handlers + * dev_dax->alive == true, have completed. Any fault handlers * that start after synchronize_srcu() has started will abort - * upon seeing dax_dev->alive == false. + * upon seeing dev_dax->alive == false. */ - dax_dev->alive = false; + dev_dax->alive = false; synchronize_srcu(&dax_srcu); - unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); + unmap_mapping_range(dev_dax->inode->i_mapping, 0, 0, 1); } -static void unregister_dax_dev(void *dev) +static void unregister_dev_dax(void *dev) { - struct dax_dev *dax_dev = to_dax_dev(dev); + struct dev_dax *dev_dax = to_dev_dax(dev); dev_dbg(dev, "%s\n", __func__); - kill_dax_dev(dax_dev); - cdev_device_del(&dax_dev->cdev, dev); + kill_dev_dax(dev_dax); + cdev_device_del(&dev_dax->cdev, dev); put_device(dev); } -struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, struct resource *res, int count) { struct device *parent = dax_region->dev; - struct dax_dev *dax_dev; + struct dev_dax *dev_dax; int rc = 0, minor, i; struct device *dev; struct cdev *cdev; dev_t dev_t; - dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); - if (!dax_dev) + dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); + if (!dev_dax) return ERR_PTR(-ENOMEM); for (i = 0; i < count; i++) { @@ -752,16 +752,16 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, rc = -EINVAL; break; } - dax_dev->res[i].start = res[i].start; - dax_dev->res[i].end = res[i].end; + dev_dax->res[i].start = res[i].start; + dev_dax->res[i].end = res[i].end; } if (i < count) goto err_id; - dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dax_dev->id < 0) { - rc = dax_dev->id; + dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dev_dax->id < 0) { + rc = dev_dax->id; goto err_id; } @@ -772,55 +772,55 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, } dev_t = MKDEV(MAJOR(dax_devt), minor); - dev = &dax_dev->dev; - dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); - if (!dax_dev->inode) { + dev = &dev_dax->dev; + dev_dax->inode = dax_inode_get(&dev_dax->cdev, dev_t); + if (!dev_dax->inode) { rc = -ENOMEM; goto err_inode; } - /* from here on we're committed to teardown via dax_dev_release() */ + /* from here on we're committed to teardown via dev_dax_release() */ device_initialize(dev); - cdev = &dax_dev->cdev; + cdev = &dev_dax->cdev; cdev_init(cdev, &dax_fops); cdev->owner = parent->driver->owner; - dax_dev->num_resources = count; - dax_dev->alive = true; - dax_dev->region = dax_region; + dev_dax->num_resources = count; + dev_dax->alive = true; + dev_dax->region = dax_region; kref_get(&dax_region->kref); dev->devt = dev_t; dev->class = dax_class; dev->parent = parent; dev->groups = dax_attribute_groups; - dev->release = dax_dev_release; - dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); + dev->release = dev_dax_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); rc = cdev_device_add(cdev, dev); if (rc) { - kill_dax_dev(dax_dev); + kill_dev_dax(dev_dax); put_device(dev); return ERR_PTR(rc); } - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); if (rc) return ERR_PTR(rc); - return dax_dev; + return dev_dax; err_inode: ida_simple_remove(&dax_minor_ida, minor); err_minor: - ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_region->ida, dev_dax->id); err_id: - kfree(dax_dev); + kfree(dev_dax); return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(devm_create_dax_dev); +EXPORT_SYMBOL_GPL(devm_create_dev_dax); static int __init dax_init(void) { diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index ddd829ab58c0..ea176d875d60 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,13 +13,13 @@ #ifndef __DAX_H__ #define __DAX_H__ struct device; -struct dax_dev; +struct dev_dax; struct resource; struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, unsigned long flags); -struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, struct resource *res, int count); #endif /* __DAX_H__ */ diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 033f49b31fdc..2c736fc4508b 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -61,8 +61,8 @@ static int dax_pmem_probe(struct device *dev) int rc; void *addr; struct resource res; - struct dax_dev *dax_dev; struct nd_pfn_sb *pfn_sb; + struct dev_dax *dev_dax; struct dax_pmem *dax_pmem; struct nd_region *nd_region; struct nd_namespace_io *nsio; @@ -130,12 +130,12 @@ static int dax_pmem_probe(struct device *dev) return -ENOMEM; /* TODO: support for subdividing a dax region... */ - dax_dev = devm_create_dax_dev(dax_region, &res, 1); + dev_dax = devm_create_dev_dax(dax_region, &res, 1); - /* child dax_dev instances now own the lifetime of the dax_region */ + /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); - return PTR_ERR_OR_ZERO(dax_dev); + return PTR_ERR_OR_ZERO(dev_dax); } static struct nd_device_driver dax_pmem_driver = { -- cgit v1.2.3-59-g8ed1b From 7b6be8444e0f0dd675b54d059793423d3c9b4c03 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 11 Apr 2017 09:49:49 -0700 Subject: dax: refactor dax-fs into a generic provider of 'struct dax_device' instances We want dax capable drivers to be able to publish a set of dax operations [1]. However, we do not want to further abuse block_devices to advertise these operations. Instead we will attach these operations to a dax device and add a lookup mechanism to go from block device path to a dax device. A dax capable driver like pmem or brd is responsible for registering a dax device, alongside a block device, and then a dax capable filesystem is responsible for retrieving the dax device by path name if it wants to call dax_operations. For now, we refactor the dax pseudo-fs to be a generic facility, rather than an implementation detail, of the device-dax use case. Where a "dax device" is just an inode + dax infrastructure, and "Device DAX" is a mapping service layered on top of that base 'struct dax_device'. "Filesystem DAX" is then a mapping service that layers a filesystem on top of that same base device. Filesystem DAX is associated with a block_device for now, but perhaps directly to a dax device in the future, or for new pmem-only filesystems. [1]: https://lkml.org/lkml/2017/1/19/880 Suggested-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/Makefile | 2 +- drivers/dax/Kconfig | 10 +- drivers/dax/Makefile | 5 +- drivers/dax/dax.c | 864 -------------------------------------------- drivers/dax/dax.h | 20 +- drivers/dax/device-dax.h | 25 ++ drivers/dax/device.c | 709 ++++++++++++++++++++++++++++++++++++ drivers/dax/pmem.c | 2 +- drivers/dax/super.c | 303 ++++++++++++++++ include/linux/dax.h | 3 + tools/testing/nvdimm/Kbuild | 10 +- 11 files changed, 1070 insertions(+), 883 deletions(-) delete mode 100644 drivers/dax/dax.c create mode 100644 drivers/dax/device-dax.h create mode 100644 drivers/dax/device.c create mode 100644 drivers/dax/super.c (limited to 'drivers/dax') diff --git a/drivers/Makefile b/drivers/Makefile index 2eced9afba53..0442e982cf35 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_PARPORT) += parport/ obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ -obj-$(CONFIG_DEV_DAX) += dax/ +obj-$(CONFIG_DAX) += dax/ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 9e95bf94eb13..b7053eafd88e 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -1,8 +1,13 @@ -menuconfig DEV_DAX +menuconfig DAX tristate "DAX: direct access to differentiated memory" + select SRCU default m if NVDIMM_DAX + +if DAX + +config DEV_DAX + tristate "Device DAX: direct access mapping device" depends on TRANSPARENT_HUGEPAGE - select SRCU help Support raw access to differentiated (persistence, bandwidth, latency...) memory via an mmap(2) capable character @@ -11,7 +16,6 @@ menuconfig DEV_DAX baseline memory pool. Mappings of a /dev/daxX.Y device impose restrictions that make the mapping behavior deterministic. -if DEV_DAX config DEV_DAX_PMEM tristate "PMEM DAX: direct access to persistent memory" diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 27c54e38478a..dc7422530462 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,4 +1,7 @@ -obj-$(CONFIG_DEV_DAX) += dax.o +obj-$(CONFIG_DAX) += dax.o +obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o +dax-y := super.o dax_pmem-y := pmem.o +device_dax-y := device.o diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c deleted file mode 100644 index 376fdd353aea..000000000000 --- a/drivers/dax/dax.c +++ /dev/null @@ -1,864 +0,0 @@ -/* - * Copyright(c) 2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "dax.h" - -static dev_t dax_devt; -DEFINE_STATIC_SRCU(dax_srcu); -static struct class *dax_class; -static DEFINE_IDA(dax_minor_ida); -static int nr_dax = CONFIG_NR_DEV_DAX; -module_param(nr_dax, int, S_IRUGO); -static struct vfsmount *dax_mnt; -static struct kmem_cache *dax_cache __read_mostly; -static struct super_block *dax_superblock __read_mostly; -MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); - -/** - * struct dax_region - mapping infrastructure for dax devices - * @id: kernel-wide unique region for a memory range - * @base: linear address corresponding to @res - * @kref: to pin while other agents have a need to do lookups - * @dev: parent device backing this region - * @align: allocation and mapping alignment for child dax devices - * @res: physical address range of the region - * @pfn_flags: identify whether the pfns are paged back or not - */ -struct dax_region { - int id; - struct ida ida; - void *base; - struct kref kref; - struct device *dev; - unsigned int align; - struct resource res; - unsigned long pfn_flags; -}; - -/** - * struct dev_dax - instance data for a subdivision of a dax region - * @region - parent region - * @dev - device backing the character device - * @cdev - core chardev data - * @alive - !alive + srcu grace period == no new mappings can be established - * @id - child id in the region - * @num_resources - number of physical address extents in this device - * @res - array of physical address ranges - */ -struct dev_dax { - struct dax_region *region; - struct inode *inode; - struct device dev; - struct cdev cdev; - bool alive; - int id; - int num_resources; - struct resource res[0]; -}; - -static ssize_t id_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%d\n", dax_region->id); - device_unlock(dev); - - return rc; -} -static DEVICE_ATTR_RO(id); - -static ssize_t region_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); - device_unlock(dev); - - return rc; -} -static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, - region_size_show, NULL); - -static ssize_t align_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%u\n", dax_region->align); - device_unlock(dev); - - return rc; -} -static DEVICE_ATTR_RO(align); - -static struct attribute *dax_region_attributes[] = { - &dev_attr_region_size.attr, - &dev_attr_align.attr, - &dev_attr_id.attr, - NULL, -}; - -static const struct attribute_group dax_region_attribute_group = { - .name = "dax_region", - .attrs = dax_region_attributes, -}; - -static const struct attribute_group *dax_region_attribute_groups[] = { - &dax_region_attribute_group, - NULL, -}; - -static struct inode *dax_alloc_inode(struct super_block *sb) -{ - return kmem_cache_alloc(dax_cache, GFP_KERNEL); -} - -static void dax_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - - kmem_cache_free(dax_cache, inode); -} - -static void dax_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, dax_i_callback); -} - -static const struct super_operations dax_sops = { - .statfs = simple_statfs, - .alloc_inode = dax_alloc_inode, - .destroy_inode = dax_destroy_inode, - .drop_inode = generic_delete_inode, -}; - -static struct dentry *dax_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); -} - -static struct file_system_type dax_type = { - .name = "dax", - .mount = dax_mount, - .kill_sb = kill_anon_super, -}; - -static int dax_test(struct inode *inode, void *data) -{ - return inode->i_cdev == data; -} - -static int dax_set(struct inode *inode, void *data) -{ - inode->i_cdev = data; - return 0; -} - -static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) -{ - struct inode *inode; - - inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), - dax_test, dax_set, cdev); - - if (!inode) - return NULL; - - if (inode->i_state & I_NEW) { - inode->i_mode = S_IFCHR; - inode->i_flags = S_DAX; - inode->i_rdev = devt; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - unlock_new_inode(inode); - } - return inode; -} - -static void init_once(void *inode) -{ - inode_init_once(inode); -} - -static int dax_inode_init(void) -{ - int rc; - - dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); - if (!dax_cache) - return -ENOMEM; - - rc = register_filesystem(&dax_type); - if (rc) - goto err_register_fs; - - dax_mnt = kern_mount(&dax_type); - if (IS_ERR(dax_mnt)) { - rc = PTR_ERR(dax_mnt); - goto err_mount; - } - dax_superblock = dax_mnt->mnt_sb; - - return 0; - - err_mount: - unregister_filesystem(&dax_type); - err_register_fs: - kmem_cache_destroy(dax_cache); - - return rc; -} - -static void dax_inode_exit(void) -{ - kern_unmount(dax_mnt); - unregister_filesystem(&dax_type); - kmem_cache_destroy(dax_cache); -} - -static void dax_region_free(struct kref *kref) -{ - struct dax_region *dax_region; - - dax_region = container_of(kref, struct dax_region, kref); - kfree(dax_region); -} - -void dax_region_put(struct dax_region *dax_region) -{ - kref_put(&dax_region->kref, dax_region_free); -} -EXPORT_SYMBOL_GPL(dax_region_put); - -static void dax_region_unregister(void *region) -{ - struct dax_region *dax_region = region; - - sysfs_remove_groups(&dax_region->dev->kobj, - dax_region_attribute_groups); - dax_region_put(dax_region); -} - -struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, unsigned int align, void *addr, - unsigned long pfn_flags) -{ - struct dax_region *dax_region; - - /* - * The DAX core assumes that it can store its private data in - * parent->driver_data. This WARN is a reminder / safeguard for - * developers of device-dax drivers. - */ - if (dev_get_drvdata(parent)) { - dev_WARN(parent, "dax core failed to setup private data\n"); - return NULL; - } - - if (!IS_ALIGNED(res->start, align) - || !IS_ALIGNED(resource_size(res), align)) - return NULL; - - dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); - if (!dax_region) - return NULL; - - dev_set_drvdata(parent, dax_region); - memcpy(&dax_region->res, res, sizeof(*res)); - dax_region->pfn_flags = pfn_flags; - kref_init(&dax_region->kref); - dax_region->id = region_id; - ida_init(&dax_region->ida); - dax_region->align = align; - dax_region->dev = parent; - dax_region->base = addr; - if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { - kfree(dax_region); - return NULL;; - } - - kref_get(&dax_region->kref); - if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) - return NULL; - return dax_region; -} -EXPORT_SYMBOL_GPL(alloc_dax_region); - -static struct dev_dax *to_dev_dax(struct device *dev) -{ - return container_of(dev, struct dev_dax, dev); -} - -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - unsigned long long size = 0; - int i; - - for (i = 0; i < dev_dax->num_resources; i++) - size += resource_size(&dev_dax->res[i]); - - return sprintf(buf, "%llu\n", size); -} -static DEVICE_ATTR_RO(size); - -static struct attribute *dev_dax_attributes[] = { - &dev_attr_size.attr, - NULL, -}; - -static const struct attribute_group dev_dax_attribute_group = { - .attrs = dev_dax_attributes, -}; - -static const struct attribute_group *dax_attribute_groups[] = { - &dev_dax_attribute_group, - NULL, -}; - -static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, - const char *func) -{ - struct dax_region *dax_region = dev_dax->region; - struct device *dev = &dev_dax->dev; - unsigned long mask; - - if (!dev_dax->alive) - return -ENXIO; - - /* prevent private mappings from being established */ - if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { - dev_info(dev, "%s: %s: fail, attempted private mapping\n", - current->comm, func); - return -EINVAL; - } - - mask = dax_region->align - 1; - if (vma->vm_start & mask || vma->vm_end & mask) { - dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", - current->comm, func, vma->vm_start, vma->vm_end, - mask); - return -EINVAL; - } - - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV - && (vma->vm_flags & VM_DONTCOPY) == 0) { - dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", - current->comm, func); - return -EINVAL; - } - - if (!vma_is_dax(vma)) { - dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", - current->comm, func); - return -EINVAL; - } - - return 0; -} - -static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, - unsigned long size) -{ - struct resource *res; - phys_addr_t phys; - int i; - - for (i = 0; i < dev_dax->num_resources; i++) { - res = &dev_dax->res[i]; - phys = pgoff * PAGE_SIZE + res->start; - if (phys >= res->start && phys <= res->end) - break; - pgoff -= PHYS_PFN(resource_size(res)); - } - - if (i < dev_dax->num_resources) { - res = &dev_dax->res[i]; - if (phys + size - 1 <= res->end) - return phys; - } - - return -1; -} - -static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; - int rc = VM_FAULT_SIGBUS; - phys_addr_t phys; - pfn_t pfn; - unsigned int fault_size = PAGE_SIZE; - - if (check_vma(dev_dax, vmf->vma, __func__)) - return VM_FAULT_SIGBUS; - - dax_region = dev_dax->region; - if (dax_region->align > PAGE_SIZE) { - dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", - __func__, dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - if (fault_size != dax_region->align) - return VM_FAULT_SIGBUS; - - phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); - if (phys == -1) { - dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, - vmf->pgoff); - return VM_FAULT_SIGBUS; - } - - pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - - rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); - - if (rc == -ENOMEM) - return VM_FAULT_OOM; - if (rc < 0 && rc != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - -static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - unsigned long pmd_addr = vmf->address & PMD_MASK; - struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; - phys_addr_t phys; - pgoff_t pgoff; - pfn_t pfn; - unsigned int fault_size = PMD_SIZE; - - if (check_vma(dev_dax, vmf->vma, __func__)) - return VM_FAULT_SIGBUS; - - dax_region = dev_dax->region; - if (dax_region->align > PMD_SIZE) { - dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", - __func__, dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - /* dax pmd mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); - return VM_FAULT_SIGBUS; - } - - if (fault_size < dax_region->align) - return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) - return VM_FAULT_FALLBACK; - - /* if we are outside of the VMA */ - if (pmd_addr < vmf->vma->vm_start || - (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) - return VM_FAULT_SIGBUS; - - pgoff = linear_page_index(vmf->vma, pmd_addr); - phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); - if (phys == -1) { - dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, - pgoff); - return VM_FAULT_SIGBUS; - } - - pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, - vmf->flags & FAULT_FLAG_WRITE); -} - -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - unsigned long pud_addr = vmf->address & PUD_MASK; - struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; - phys_addr_t phys; - pgoff_t pgoff; - pfn_t pfn; - unsigned int fault_size = PUD_SIZE; - - - if (check_vma(dev_dax, vmf->vma, __func__)) - return VM_FAULT_SIGBUS; - - dax_region = dev_dax->region; - if (dax_region->align > PUD_SIZE) { - dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", - __func__, dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - /* dax pud mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); - return VM_FAULT_SIGBUS; - } - - if (fault_size < dax_region->align) - return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) - return VM_FAULT_FALLBACK; - - /* if we are outside of the VMA */ - if (pud_addr < vmf->vma->vm_start || - (pud_addr + PUD_SIZE) > vmf->vma->vm_end) - return VM_FAULT_SIGBUS; - - pgoff = linear_page_index(vmf->vma, pud_addr); - phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); - if (phys == -1) { - dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, - pgoff); - return VM_FAULT_SIGBUS; - } - - pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - - return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn, - vmf->flags & FAULT_FLAG_WRITE); -} -#else -static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - return VM_FAULT_FALLBACK; -} -#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ - -static int dev_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) -{ - int rc, id; - struct file *filp = vmf->vma->vm_file; - struct dev_dax *dev_dax = filp->private_data; - - dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, - current->comm, (vmf->flags & FAULT_FLAG_WRITE) - ? "write" : "read", - vmf->vma->vm_start, vmf->vma->vm_end, pe_size); - - id = srcu_read_lock(&dax_srcu); - switch (pe_size) { - case PE_SIZE_PTE: - rc = __dev_dax_pte_fault(dev_dax, vmf); - break; - case PE_SIZE_PMD: - rc = __dev_dax_pmd_fault(dev_dax, vmf); - break; - case PE_SIZE_PUD: - rc = __dev_dax_pud_fault(dev_dax, vmf); - break; - default: - rc = VM_FAULT_SIGBUS; - } - srcu_read_unlock(&dax_srcu, id); - - return rc; -} - -static int dev_dax_fault(struct vm_fault *vmf) -{ - return dev_dax_huge_fault(vmf, PE_SIZE_PTE); -} - -static const struct vm_operations_struct dax_vm_ops = { - .fault = dev_dax_fault, - .huge_fault = dev_dax_huge_fault, -}; - -static int dax_mmap(struct file *filp, struct vm_area_struct *vma) -{ - struct dev_dax *dev_dax = filp->private_data; - int rc; - - dev_dbg(&dev_dax->dev, "%s\n", __func__); - - rc = check_vma(dev_dax, vma, __func__); - if (rc) - return rc; - - vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; - return 0; -} - -/* return an unmapped area aligned to the dax region specified alignment */ -static unsigned long dax_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dev_dax *dev_dax = filp ? filp->private_data : NULL; - struct dax_region *dax_region; - - if (!dev_dax || addr) - goto out; - - dax_region = dev_dax->region; - align = dax_region->align; - off = pgoff << PAGE_SHIFT; - off_end = off + len; - off_align = round_up(off, align); - - if ((off_end <= off_align) || ((off_end - off_align) < align)) - goto out; - - len_align = len + align; - if ((off + len_align) < off) - goto out; - - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); - if (!IS_ERR_VALUE(addr_align)) { - addr_align += (off - addr_align) & (align - 1); - return addr_align; - } - out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -static int dax_open(struct inode *inode, struct file *filp) -{ - struct dev_dax *dev_dax; - - dev_dax = container_of(inode->i_cdev, struct dev_dax, cdev); - dev_dbg(&dev_dax->dev, "%s\n", __func__); - inode->i_mapping = dev_dax->inode->i_mapping; - inode->i_mapping->host = dev_dax->inode; - filp->f_mapping = inode->i_mapping; - filp->private_data = dev_dax; - inode->i_flags = S_DAX; - - return 0; -} - -static int dax_release(struct inode *inode, struct file *filp) -{ - struct dev_dax *dev_dax = filp->private_data; - - dev_dbg(&dev_dax->dev, "%s\n", __func__); - return 0; -} - -static const struct file_operations dax_fops = { - .llseek = noop_llseek, - .owner = THIS_MODULE, - .open = dax_open, - .release = dax_release, - .get_unmapped_area = dax_get_unmapped_area, - .mmap = dax_mmap, -}; - -static void dev_dax_release(struct device *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - struct dax_region *dax_region = dev_dax->region; - - ida_simple_remove(&dax_region->ida, dev_dax->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); - dax_region_put(dax_region); - iput(dev_dax->inode); - kfree(dev_dax); -} - -static void kill_dev_dax(struct dev_dax *dev_dax) -{ - /* - * Note, rcu is not protecting the liveness of dev_dax, rcu is - * ensuring that any fault handlers that might have seen - * dev_dax->alive == true, have completed. Any fault handlers - * that start after synchronize_srcu() has started will abort - * upon seeing dev_dax->alive == false. - */ - dev_dax->alive = false; - synchronize_srcu(&dax_srcu); - unmap_mapping_range(dev_dax->inode->i_mapping, 0, 0, 1); -} - -static void unregister_dev_dax(void *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - - dev_dbg(dev, "%s\n", __func__); - - kill_dev_dax(dev_dax); - cdev_device_del(&dev_dax->cdev, dev); - put_device(dev); -} - -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - struct resource *res, int count) -{ - struct device *parent = dax_region->dev; - struct dev_dax *dev_dax; - int rc = 0, minor, i; - struct device *dev; - struct cdev *cdev; - dev_t dev_t; - - dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); - if (!dev_dax) - return ERR_PTR(-ENOMEM); - - for (i = 0; i < count; i++) { - if (!IS_ALIGNED(res[i].start, dax_region->align) - || !IS_ALIGNED(resource_size(&res[i]), - dax_region->align)) { - rc = -EINVAL; - break; - } - dev_dax->res[i].start = res[i].start; - dev_dax->res[i].end = res[i].end; - } - - if (i < count) - goto err_id; - - dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dev_dax->id < 0) { - rc = dev_dax->id; - goto err_id; - } - - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); - if (minor < 0) { - rc = minor; - goto err_minor; - } - - dev_t = MKDEV(MAJOR(dax_devt), minor); - dev = &dev_dax->dev; - dev_dax->inode = dax_inode_get(&dev_dax->cdev, dev_t); - if (!dev_dax->inode) { - rc = -ENOMEM; - goto err_inode; - } - - /* from here on we're committed to teardown via dev_dax_release() */ - device_initialize(dev); - - cdev = &dev_dax->cdev; - cdev_init(cdev, &dax_fops); - cdev->owner = parent->driver->owner; - - dev_dax->num_resources = count; - dev_dax->alive = true; - dev_dax->region = dax_region; - kref_get(&dax_region->kref); - - dev->devt = dev_t; - dev->class = dax_class; - dev->parent = parent; - dev->groups = dax_attribute_groups; - dev->release = dev_dax_release; - dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); - - rc = cdev_device_add(cdev, dev); - if (rc) { - kill_dev_dax(dev_dax); - put_device(dev); - return ERR_PTR(rc); - } - - rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); - if (rc) - return ERR_PTR(rc); - - return dev_dax; - - err_inode: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - ida_simple_remove(&dax_region->ida, dev_dax->id); - err_id: - kfree(dev_dax); - - return ERR_PTR(rc); -} -EXPORT_SYMBOL_GPL(devm_create_dev_dax); - -static int __init dax_init(void) -{ - int rc; - - rc = dax_inode_init(); - if (rc) - return rc; - - nr_dax = max(nr_dax, 256); - rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); - if (rc) - goto err_chrdev; - - dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) { - rc = PTR_ERR(dax_class); - goto err_class; - } - - return 0; - - err_class: - unregister_chrdev_region(dax_devt, nr_dax); - err_chrdev: - dax_inode_exit(); - return rc; -} - -static void __exit dax_exit(void) -{ - class_destroy(dax_class); - unregister_chrdev_region(dax_devt, nr_dax); - ida_destroy(&dax_minor_ida); - dax_inode_exit(); -} - -MODULE_AUTHOR("Intel Corporation"); -MODULE_LICENSE("GPL v2"); -subsys_initcall(dax_init); -module_exit(dax_exit); diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index ea176d875d60..2472d9da96db 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. All rights reserved. + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -12,14 +12,12 @@ */ #ifndef __DAX_H__ #define __DAX_H__ -struct device; -struct dev_dax; -struct resource; -struct dax_region; -void dax_region_put(struct dax_region *dax_region); -struct dax_region *alloc_dax_region(struct device *parent, - int region_id, struct resource *res, unsigned int align, - void *addr, unsigned long flags); -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - struct resource *res, int count); +struct dax_device; +struct dax_device *alloc_dax(void *private); +void put_dax(struct dax_device *dax_dev); +bool dax_alive(struct dax_device *dax_dev); +void kill_dax(struct dax_device *dax_dev); +struct dax_device *inode_dax(struct inode *inode); +struct inode *dax_inode(struct dax_device *dax_dev); +void *dax_get_private(struct dax_device *dax_dev); #endif /* __DAX_H__ */ diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h new file mode 100644 index 000000000000..fdcd9769ffde --- /dev/null +++ b/drivers/dax/device-dax.h @@ -0,0 +1,25 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __DEVICE_DAX_H__ +#define __DEVICE_DAX_H__ +struct device; +struct dev_dax; +struct resource; +struct dax_region; +void dax_region_put(struct dax_region *dax_region); +struct dax_region *alloc_dax_region(struct device *parent, + int region_id, struct resource *res, unsigned int align, + void *addr, unsigned long flags); +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, + struct resource *res, int count); +#endif /* __DEVICE_DAX_H__ */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c new file mode 100644 index 000000000000..19a42edbfa03 --- /dev/null +++ b/drivers/dax/device.c @@ -0,0 +1,709 @@ +/* + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax.h" + +static struct class *dax_class; + +/** + * struct dax_region - mapping infrastructure for dax devices + * @id: kernel-wide unique region for a memory range + * @base: linear address corresponding to @res + * @kref: to pin while other agents have a need to do lookups + * @dev: parent device backing this region + * @align: allocation and mapping alignment for child dax devices + * @res: physical address range of the region + * @pfn_flags: identify whether the pfns are paged back or not + */ +struct dax_region { + int id; + struct ida ida; + void *base; + struct kref kref; + struct device *dev; + unsigned int align; + struct resource res; + unsigned long pfn_flags; +}; + +/** + * struct dev_dax - instance data for a subdivision of a dax region + * @region - parent region + * @dax_dev - core dax functionality + * @dev - device core + * @id - child id in the region + * @num_resources - number of physical address extents in this device + * @res - array of physical address ranges + */ +struct dev_dax { + struct dax_region *region; + struct dax_device *dax_dev; + struct device dev; + int id; + int num_resources; + struct resource res[0]; +}; + +static ssize_t id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region; + ssize_t rc = -ENXIO; + + device_lock(dev); + dax_region = dev_get_drvdata(dev); + if (dax_region) + rc = sprintf(buf, "%d\n", dax_region->id); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(id); + +static ssize_t region_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region; + ssize_t rc = -ENXIO; + + device_lock(dev); + dax_region = dev_get_drvdata(dev); + if (dax_region) + rc = sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&dax_region->res)); + device_unlock(dev); + + return rc; +} +static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, + region_size_show, NULL); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region; + ssize_t rc = -ENXIO; + + device_lock(dev); + dax_region = dev_get_drvdata(dev); + if (dax_region) + rc = sprintf(buf, "%u\n", dax_region->align); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(align); + +static struct attribute *dax_region_attributes[] = { + &dev_attr_region_size.attr, + &dev_attr_align.attr, + &dev_attr_id.attr, + NULL, +}; + +static const struct attribute_group dax_region_attribute_group = { + .name = "dax_region", + .attrs = dax_region_attributes, +}; + +static const struct attribute_group *dax_region_attribute_groups[] = { + &dax_region_attribute_group, + NULL, +}; + +static void dax_region_free(struct kref *kref) +{ + struct dax_region *dax_region; + + dax_region = container_of(kref, struct dax_region, kref); + kfree(dax_region); +} + +void dax_region_put(struct dax_region *dax_region) +{ + kref_put(&dax_region->kref, dax_region_free); +} +EXPORT_SYMBOL_GPL(dax_region_put); + +static void dax_region_unregister(void *region) +{ + struct dax_region *dax_region = region; + + sysfs_remove_groups(&dax_region->dev->kobj, + dax_region_attribute_groups); + dax_region_put(dax_region); +} + +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct resource *res, unsigned int align, void *addr, + unsigned long pfn_flags) +{ + struct dax_region *dax_region; + + /* + * The DAX core assumes that it can store its private data in + * parent->driver_data. This WARN is a reminder / safeguard for + * developers of device-dax drivers. + */ + if (dev_get_drvdata(parent)) { + dev_WARN(parent, "dax core failed to setup private data\n"); + return NULL; + } + + if (!IS_ALIGNED(res->start, align) + || !IS_ALIGNED(resource_size(res), align)) + return NULL; + + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!dax_region) + return NULL; + + dev_set_drvdata(parent, dax_region); + memcpy(&dax_region->res, res, sizeof(*res)); + dax_region->pfn_flags = pfn_flags; + kref_init(&dax_region->kref); + dax_region->id = region_id; + ida_init(&dax_region->ida); + dax_region->align = align; + dax_region->dev = parent; + dax_region->base = addr; + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { + kfree(dax_region); + return NULL;; + } + + kref_get(&dax_region->kref); + if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) + return NULL; + return dax_region; +} +EXPORT_SYMBOL_GPL(alloc_dax_region); + +static struct dev_dax *to_dev_dax(struct device *dev) +{ + return container_of(dev, struct dev_dax, dev); +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + unsigned long long size = 0; + int i; + + for (i = 0; i < dev_dax->num_resources; i++) + size += resource_size(&dev_dax->res[i]); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(size); + +static struct attribute *dev_dax_attributes[] = { + &dev_attr_size.attr, + NULL, +}; + +static const struct attribute_group dev_dax_attribute_group = { + .attrs = dev_dax_attributes, +}; + +static const struct attribute_group *dax_attribute_groups[] = { + &dev_dax_attribute_group, + NULL, +}; + +static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, + const char *func) +{ + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + unsigned long mask; + + if (!dax_alive(dev_dax->dax_dev)) + return -ENXIO; + + /* prevent private mappings from being established */ + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + dev_info(dev, "%s: %s: fail, attempted private mapping\n", + current->comm, func); + return -EINVAL; + } + + mask = dax_region->align - 1; + if (vma->vm_start & mask || vma->vm_end & mask) { + dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", + current->comm, func, vma->vm_start, vma->vm_end, + mask); + return -EINVAL; + } + + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV + && (vma->vm_flags & VM_DONTCOPY) == 0) { + dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", + current->comm, func); + return -EINVAL; + } + + if (!vma_is_dax(vma)) { + dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", + current->comm, func); + return -EINVAL; + } + + return 0; +} + +static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, + unsigned long size) +{ + struct resource *res; + phys_addr_t phys; + int i; + + for (i = 0; i < dev_dax->num_resources; i++) { + res = &dev_dax->res[i]; + phys = pgoff * PAGE_SIZE + res->start; + if (phys >= res->start && phys <= res->end) + break; + pgoff -= PHYS_PFN(resource_size(res)); + } + + if (i < dev_dax->num_resources) { + res = &dev_dax->res[i]; + if (phys + size - 1 <= res->end) + return phys; + } + + return -1; +} + +static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + struct device *dev = &dev_dax->dev; + struct dax_region *dax_region; + int rc = VM_FAULT_SIGBUS; + phys_addr_t phys; + pfn_t pfn; + unsigned int fault_size = PAGE_SIZE; + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dev_dax->region; + if (dax_region->align > PAGE_SIZE) { + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); + return VM_FAULT_SIGBUS; + } + + if (fault_size != dax_region->align) + return VM_FAULT_SIGBUS; + + phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, + vmf->pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); + + if (rc == -ENOMEM) + return VM_FAULT_OOM; + if (rc < 0 && rc != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct device *dev = &dev_dax->dev; + struct dax_region *dax_region; + phys_addr_t phys; + pgoff_t pgoff; + pfn_t pfn; + unsigned int fault_size = PMD_SIZE; + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dev_dax->region; + if (dax_region->align > PMD_SIZE) { + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); + return VM_FAULT_SIGBUS; + } + + /* dax pmd mappings require pfn_t_devmap() */ + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); + return VM_FAULT_SIGBUS; + } + + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pmd_addr < vmf->vma->vm_start || + (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + pgoff = linear_page_index(vmf->vma, pmd_addr); + phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, + pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, + vmf->flags & FAULT_FLAG_WRITE); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + unsigned long pud_addr = vmf->address & PUD_MASK; + struct device *dev = &dev_dax->dev; + struct dax_region *dax_region; + phys_addr_t phys; + pgoff_t pgoff; + pfn_t pfn; + unsigned int fault_size = PUD_SIZE; + + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dev_dax->region; + if (dax_region->align > PUD_SIZE) { + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); + return VM_FAULT_SIGBUS; + } + + /* dax pud mappings require pfn_t_devmap() */ + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); + return VM_FAULT_SIGBUS; + } + + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pud_addr < vmf->vma->vm_start || + (pud_addr + PUD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + pgoff = linear_page_index(vmf->vma, pud_addr); + phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, + pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn, + vmf->flags & FAULT_FLAG_WRITE); +} +#else +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + return VM_FAULT_FALLBACK; +} +#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + +static int dev_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + int rc, id; + struct file *filp = vmf->vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + + dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, + current->comm, (vmf->flags & FAULT_FLAG_WRITE) + ? "write" : "read", + vmf->vma->vm_start, vmf->vma->vm_end, pe_size); + + id = dax_read_lock(); + switch (pe_size) { + case PE_SIZE_PTE: + rc = __dev_dax_pte_fault(dev_dax, vmf); + break; + case PE_SIZE_PMD: + rc = __dev_dax_pmd_fault(dev_dax, vmf); + break; + case PE_SIZE_PUD: + rc = __dev_dax_pud_fault(dev_dax, vmf); + break; + default: + rc = VM_FAULT_SIGBUS; + } + dax_read_unlock(id); + + return rc; +} + +static int dev_dax_fault(struct vm_fault *vmf) +{ + return dev_dax_huge_fault(vmf, PE_SIZE_PTE); +} + +static const struct vm_operations_struct dax_vm_ops = { + .fault = dev_dax_fault, + .huge_fault = dev_dax_huge_fault, +}; + +static int dax_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct dev_dax *dev_dax = filp->private_data; + int rc, id; + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + + /* + * We lock to check dax_dev liveness and will re-check at + * fault time. + */ + id = dax_read_lock(); + rc = check_vma(dev_dax, vma, __func__); + dax_read_unlock(id); + if (rc) + return rc; + + vma->vm_ops = &dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; +} + +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dev_dax *dev_dax = filp ? filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dev_dax || addr) + goto out; + + dax_region = dev_dax->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int dax_open(struct inode *inode, struct file *filp) +{ + struct dax_device *dax_dev = inode_dax(inode); + struct inode *__dax_inode = dax_inode(dax_dev); + struct dev_dax *dev_dax = dax_get_private(dax_dev); + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + inode->i_mapping = __dax_inode->i_mapping; + inode->i_mapping->host = __dax_inode; + filp->f_mapping = inode->i_mapping; + filp->private_data = dev_dax; + inode->i_flags = S_DAX; + + return 0; +} + +static int dax_release(struct inode *inode, struct file *filp) +{ + struct dev_dax *dev_dax = filp->private_data; + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + return 0; +} + +static const struct file_operations dax_fops = { + .llseek = noop_llseek, + .owner = THIS_MODULE, + .open = dax_open, + .release = dax_release, + .get_unmapped_area = dax_get_unmapped_area, + .mmap = dax_mmap, +}; + +static void dev_dax_release(struct device *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + struct dax_device *dax_dev = dev_dax->dax_dev; + + ida_simple_remove(&dax_region->ida, dev_dax->id); + dax_region_put(dax_region); + put_dax(dax_dev); + kfree(dev_dax); +} + +static void kill_dev_dax(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); +} + +static void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + struct cdev *cdev = inode->i_cdev; + + dev_dbg(dev, "%s\n", __func__); + + kill_dev_dax(dev_dax); + cdev_device_del(cdev, dev); + put_device(dev); +} + +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, + struct resource *res, int count) +{ + struct device *parent = dax_region->dev; + struct dax_device *dax_dev; + struct dev_dax *dev_dax; + struct inode *inode; + struct device *dev; + struct cdev *cdev; + int rc = 0, i; + + dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); + if (!dev_dax) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (!IS_ALIGNED(res[i].start, dax_region->align) + || !IS_ALIGNED(resource_size(&res[i]), + dax_region->align)) { + rc = -EINVAL; + break; + } + dev_dax->res[i].start = res[i].start; + dev_dax->res[i].end = res[i].end; + } + + if (i < count) + goto err_id; + + dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dev_dax->id < 0) { + rc = dev_dax->id; + goto err_id; + } + + dax_dev = alloc_dax(dev_dax); + if (!dax_dev) + goto err_dax; + + /* from here on we're committed to teardown via dax_dev_release() */ + dev = &dev_dax->dev; + device_initialize(dev); + + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; + cdev_init(cdev, &dax_fops); + cdev->owner = parent->driver->owner; + + dev_dax->num_resources = count; + dev_dax->dax_dev = dax_dev; + dev_dax->region = dax_region; + kref_get(&dax_region->kref); + + dev->devt = inode->i_rdev; + dev->class = dax_class; + dev->parent = parent; + dev->groups = dax_attribute_groups; + dev->release = dev_dax_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); + + rc = cdev_device_add(cdev, dev); + if (rc) { + kill_dev_dax(dev_dax); + put_device(dev); + return ERR_PTR(rc); + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); + if (rc) + return ERR_PTR(rc); + + return dev_dax; + + err_dax: + ida_simple_remove(&dax_region->ida, dev_dax->id); + err_id: + kfree(dev_dax); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(devm_create_dev_dax); + +static int __init dax_init(void) +{ + dax_class = class_create(THIS_MODULE, "dax"); + return PTR_ERR_OR_ZERO(dax_class); +} + +static void __exit dax_exit(void) +{ + class_destroy(dax_class); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +subsys_initcall(dax_init); +module_exit(dax_exit); diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 2c736fc4508b..d4ca19bd74eb 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -16,7 +16,7 @@ #include #include "../nvdimm/pfn.h" #include "../nvdimm/nd.h" -#include "dax.h" +#include "device-dax.h" struct dax_pmem { struct device *dev; diff --git a/drivers/dax/super.c b/drivers/dax/super.c new file mode 100644 index 000000000000..c9f85f1c086e --- /dev/null +++ b/drivers/dax/super.c @@ -0,0 +1,303 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static int nr_dax = CONFIG_NR_DEV_DAX; +module_param(nr_dax, int, S_IRUGO); +MODULE_PARM_DESC(nr_dax, "max number of dax device instances"); + +static dev_t dax_devt; +DEFINE_STATIC_SRCU(dax_srcu); +static struct vfsmount *dax_mnt; +static DEFINE_IDA(dax_minor_ida); +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; + +int dax_read_lock(void) +{ + return srcu_read_lock(&dax_srcu); +} +EXPORT_SYMBOL_GPL(dax_read_lock); + +void dax_read_unlock(int id) +{ + srcu_read_unlock(&dax_srcu, id); +} +EXPORT_SYMBOL_GPL(dax_read_unlock); + +/** + * struct dax_device - anchor object for dax services + * @inode: core vfs + * @cdev: optional character interface for "device dax" + * @private: dax driver private data + * @alive: !alive + rcu grace period == no new operations / mappings + */ +struct dax_device { + struct inode inode; + struct cdev cdev; + void *private; + bool alive; +}; + +bool dax_alive(struct dax_device *dax_dev) +{ + lockdep_assert_held(&dax_srcu); + return dax_dev->alive; +} +EXPORT_SYMBOL_GPL(dax_alive); + +/* + * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring + * that any fault handlers or operations that might have seen + * dax_alive(), have completed. Any operations that start after + * synchronize_srcu() has run will abort upon seeing !dax_alive(). + */ +void kill_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + + dax_dev->alive = false; + synchronize_srcu(&dax_srcu); + dax_dev->private = NULL; +} +EXPORT_SYMBOL_GPL(kill_dax); + +static struct inode *dax_alloc_inode(struct super_block *sb) +{ + struct dax_device *dax_dev; + + dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + return &dax_dev->inode; +} + +static struct dax_device *to_dax_dev(struct inode *inode) +{ + return container_of(inode, struct dax_device, inode); +} + +static void dax_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct dax_device *dax_dev = to_dax_dev(inode); + + ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); + kmem_cache_free(dax_cache, dax_dev); +} + +static void dax_destroy_inode(struct inode *inode) +{ + struct dax_device *dax_dev = to_dax_dev(inode); + + WARN_ONCE(dax_dev->alive, + "kill_dax() must be called before final iput()\n"); + call_rcu(&inode->i_rcu, dax_i_callback); +} + +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .drop_inode = generic_delete_inode, +}; + +static struct dentry *dax_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); +} + +static struct file_system_type dax_fs_type = { + .name = "dax", + .mount = dax_mount, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + return inode->i_rdev == devt; +} + +static int dax_set(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + inode->i_rdev = devt; + return 0; +} + +static struct dax_device *dax_dev_get(dev_t devt) +{ + struct dax_device *dax_dev; + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, &devt); + + if (!inode) + return NULL; + + dax_dev = to_dax_dev(inode); + if (inode->i_state & I_NEW) { + dax_dev->alive = true; + inode->i_cdev = &dax_dev->cdev; + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + + return dax_dev; +} + +struct dax_device *alloc_dax(void *private) +{ + struct dax_device *dax_dev; + dev_t devt; + int minor; + + minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL); + if (minor < 0) + return NULL; + + devt = MKDEV(MAJOR(dax_devt), minor); + dax_dev = dax_dev_get(devt); + if (!dax_dev) + goto err_inode; + + dax_dev->private = private; + return dax_dev; + + err_inode: + ida_simple_remove(&dax_minor_ida, minor); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_dax); + +void put_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + iput(&dax_dev->inode); +} +EXPORT_SYMBOL_GPL(put_dax); + +/** + * inode_dax: convert a public inode into its dax_dev + * @inode: An inode with i_cdev pointing to a dax_dev + * + * Note this is not equivalent to to_dax_dev() which is for private + * internal use where we know the inode filesystem type == dax_fs_type. + */ +struct dax_device *inode_dax(struct inode *inode) +{ + struct cdev *cdev = inode->i_cdev; + + return container_of(cdev, struct dax_device, cdev); +} +EXPORT_SYMBOL_GPL(inode_dax); + +struct inode *dax_inode(struct dax_device *dax_dev) +{ + return &dax_dev->inode; +} +EXPORT_SYMBOL_GPL(dax_inode); + +void *dax_get_private(struct dax_device *dax_dev) +{ + return dax_dev->private; +} +EXPORT_SYMBOL_GPL(dax_get_private); + +static void init_once(void *_dax_dev) +{ + struct dax_device *dax_dev = _dax_dev; + struct inode *inode = &dax_dev->inode; + + inode_init_once(inode); +} + +static int __dax_fs_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + rc = register_filesystem(&dax_fs_type); + if (rc) + goto err_register_fs; + + dax_mnt = kern_mount(&dax_fs_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + unregister_filesystem(&dax_fs_type); + err_register_fs: + kmem_cache_destroy(dax_cache); + + return rc; +} + +static void __dax_fs_exit(void) +{ + kern_unmount(dax_mnt); + unregister_filesystem(&dax_fs_type); + kmem_cache_destroy(dax_cache); +} + +static int __init dax_fs_init(void) +{ + int rc; + + rc = __dax_fs_init(); + if (rc) + return rc; + + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) + __dax_fs_exit(); + return rc; +} + +static void __exit dax_fs_exit(void) +{ + unregister_chrdev_region(dax_devt, nr_dax); + ida_destroy(&dax_minor_ida); + __dax_fs_exit(); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +subsys_initcall(dax_fs_init); +module_exit(dax_fs_exit); diff --git a/include/linux/dax.h b/include/linux/dax.h index d8a3dc042e1c..5b62f5d19aea 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -8,6 +8,9 @@ struct iomap_ops; +int dax_read_lock(void); +void dax_read_unlock(int id); + /* * We use lowest available bit in exceptional entry for locking, one bit for * the entry size (PMD) and two more to tell us if the entry is a huge zero diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 405212be044a..2033ad03b8cd 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -28,7 +28,10 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_ACPI_NFIT) += nfit.o -obj-$(CONFIG_DEV_DAX) += dax.o +ifeq ($(CONFIG_DAX),m) +obj-$(CONFIG_DAX) += dax.o +endif +obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o nfit-y := $(ACPI_SRC)/core.o @@ -48,9 +51,12 @@ nd_blk-y += config_check.o nd_e820-y := $(NVDIMM_SRC)/e820.o nd_e820-y += config_check.o -dax-y := $(DAX_SRC)/dax.o +dax-y := $(DAX_SRC)/super.o dax-y += config_check.o +device_dax-y := $(DAX_SRC)/device.o +device_dax-y += config_check.o + dax_pmem-y := $(DAX_SRC)/pmem.o dax_pmem-y += config_check.o -- cgit v1.2.3-59-g8ed1b From 72058005411ffddcae6c06f7b691d635489132af Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 19 Apr 2017 15:14:31 -0700 Subject: dax: add a facility to lookup a dax device by 'host' device name For the current block_device based filesystem-dax path, we need a way for it to lookup the dax_device associated with a block_device. Add a 'host' property of a dax_device that can be used for this purpose. It is a free form string, but for a dax_device associated with a block device it is the bdev name. This is a stop-gap until filesystems are able to mount on a dax-inode directly. Signed-off-by: Dan Williams --- drivers/dax/dax.h | 2 +- drivers/dax/device.c | 2 +- drivers/dax/super.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/dax.h | 1 + 4 files changed, 86 insertions(+), 6 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index 2472d9da96db..246a24d68d4c 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,7 +13,7 @@ #ifndef __DAX_H__ #define __DAX_H__ struct dax_device; -struct dax_device *alloc_dax(void *private); +struct dax_device *alloc_dax(void *private, const char *host); void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 19a42edbfa03..db68f4fa8ce0 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -645,7 +645,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, goto err_id; } - dax_dev = alloc_dax(dev_dax); + dax_dev = alloc_dax(dev_dax, NULL); if (!dax_dev) goto err_dax; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index c9f85f1c086e..8d446674c1da 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -30,6 +30,10 @@ static DEFINE_IDA(dax_minor_ida); static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; +#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) +static struct hlist_head dax_host_list[DAX_HASH_SIZE]; +static DEFINE_SPINLOCK(dax_host_lock); + int dax_read_lock(void) { return srcu_read_lock(&dax_srcu); @@ -46,12 +50,15 @@ EXPORT_SYMBOL_GPL(dax_read_unlock); * struct dax_device - anchor object for dax services * @inode: core vfs * @cdev: optional character interface for "device dax" + * @host: optional name for lookups where the device path is not available * @private: dax driver private data * @alive: !alive + rcu grace period == no new operations / mappings */ struct dax_device { + struct hlist_node list; struct inode inode; struct cdev cdev; + const char *host; void *private; bool alive; }; @@ -63,6 +70,11 @@ bool dax_alive(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(dax_alive); +static int dax_host_hash(const char *host) +{ + return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; +} + /* * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring * that any fault handlers or operations that might have seen @@ -75,7 +87,13 @@ void kill_dax(struct dax_device *dax_dev) return; dax_dev->alive = false; + synchronize_srcu(&dax_srcu); + + spin_lock(&dax_host_lock); + hlist_del_init(&dax_dev->list); + spin_unlock(&dax_host_lock); + dax_dev->private = NULL; } EXPORT_SYMBOL_GPL(kill_dax); @@ -98,6 +116,8 @@ static void dax_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct dax_device *dax_dev = to_dax_dev(inode); + kfree(dax_dev->host); + dax_dev->host = NULL; ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); kmem_cache_free(dax_cache, dax_dev); } @@ -169,26 +189,53 @@ static struct dax_device *dax_dev_get(dev_t devt) return dax_dev; } -struct dax_device *alloc_dax(void *private) +static void dax_add_host(struct dax_device *dax_dev, const char *host) +{ + int hash; + + /* + * Unconditionally init dax_dev since it's coming from a + * non-zeroed slab cache + */ + INIT_HLIST_NODE(&dax_dev->list); + dax_dev->host = host; + if (!host) + return; + + hash = dax_host_hash(host); + spin_lock(&dax_host_lock); + hlist_add_head(&dax_dev->list, &dax_host_list[hash]); + spin_unlock(&dax_host_lock); +} + +struct dax_device *alloc_dax(void *private, const char *__host) { struct dax_device *dax_dev; + const char *host; dev_t devt; int minor; + host = kstrdup(__host, GFP_KERNEL); + if (__host && !host) + return NULL; + minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL); if (minor < 0) - return NULL; + goto err_minor; devt = MKDEV(MAJOR(dax_devt), minor); dax_dev = dax_dev_get(devt); if (!dax_dev) - goto err_inode; + goto err_dev; + dax_add_host(dax_dev, host); dax_dev->private = private; return dax_dev; - err_inode: + err_dev: ida_simple_remove(&dax_minor_ida, minor); + err_minor: + kfree(host); return NULL; } EXPORT_SYMBOL_GPL(alloc_dax); @@ -201,6 +248,38 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); +/** + * dax_get_by_host() - temporary lookup mechanism for filesystem-dax + * @host: alternate name for the device registered by a dax driver + */ +struct dax_device *dax_get_by_host(const char *host) +{ + struct dax_device *dax_dev, *found = NULL; + int hash, id; + + if (!host) + return NULL; + + hash = dax_host_hash(host); + + id = dax_read_lock(); + spin_lock(&dax_host_lock); + hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { + if (!dax_alive(dax_dev) + || strcmp(host, dax_dev->host) != 0) + continue; + + if (igrab(&dax_dev->inode)) + found = dax_dev; + break; + } + spin_unlock(&dax_host_lock); + dax_read_unlock(id); + + return found; +} +EXPORT_SYMBOL_GPL(dax_get_by_host); + /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/include/linux/dax.h b/include/linux/dax.h index 5b62f5d19aea..9b2d5ba10d7d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -10,6 +10,7 @@ struct iomap_ops; int dax_read_lock(void); void dax_read_unlock(int id); +struct dax_device *dax_get_by_host(const char *host); /* * We use lowest available bit in exceptional entry for locking, one bit for -- cgit v1.2.3-59-g8ed1b From 6568b08b77816cda2a95919c7494108d983d5941 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jan 2017 18:44:18 -0800 Subject: dax: introduce dax_operations Track a set of dax_operations per dax_device that can be set at alloc_dax() time. These operations will be used to stop the abuse of block_device_operations for communicating dax capabilities to filesystems. It will also be used to replace the "pmem api" and move pmem-specific cache maintenance, and other dax-driver-specific filesystem-dax operations, to dax device methods. In particular this allows us to stop abusing __copy_user_nocache(), via memcpy_to_pmem(), with a driver specific replacement. This is a standalone introduction of the operations. Follow on patches convert each dax-driver and teach fs/dax.c to use ->direct_access() from dax_operations instead of block_device_operations. Suggested-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/dax.h | 4 +++- drivers/dax/device.c | 6 +++++- drivers/dax/super.c | 6 +++++- include/linux/dax.h | 10 ++++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index 246a24d68d4c..617bbc24be2b 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,7 +13,9 @@ #ifndef __DAX_H__ #define __DAX_H__ struct dax_device; -struct dax_device *alloc_dax(void *private, const char *host); +struct dax_operations; +struct dax_device *alloc_dax(void *private, const char *host, + const struct dax_operations *ops); void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); diff --git a/drivers/dax/device.c b/drivers/dax/device.c index db68f4fa8ce0..a0db055054a4 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -645,7 +645,11 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, goto err_id; } - dax_dev = alloc_dax(dev_dax, NULL); + /* + * No 'host' or dax_operations since there is no access to this + * device outside of mmap of the resulting character device. + */ + dax_dev = alloc_dax(dev_dax, NULL, NULL); if (!dax_dev) goto err_dax; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 8d446674c1da..1a58542ee8fd 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -17,6 +17,7 @@ #include #include #include +#include #include static int nr_dax = CONFIG_NR_DEV_DAX; @@ -61,6 +62,7 @@ struct dax_device { const char *host; void *private; bool alive; + const struct dax_operations *ops; }; bool dax_alive(struct dax_device *dax_dev) @@ -208,7 +210,8 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host) spin_unlock(&dax_host_lock); } -struct dax_device *alloc_dax(void *private, const char *__host) +struct dax_device *alloc_dax(void *private, const char *__host, + const struct dax_operations *ops) { struct dax_device *dax_dev; const char *host; @@ -229,6 +232,7 @@ struct dax_device *alloc_dax(void *private, const char *__host) goto err_dev; dax_add_host(dax_dev, host); + dax_dev->ops = ops; dax_dev->private = private; return dax_dev; diff --git a/include/linux/dax.h b/include/linux/dax.h index 9b2d5ba10d7d..74ebb92b625a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,6 +7,16 @@ #include struct iomap_ops; +struct dax_device; +struct dax_operations { + /* + * direct_access: translate a device-relative + * logical-page-offset into an absolute physical pfn. Return the + * number of pages available for DAX at that pfn. + */ + long (*direct_access)(struct dax_device *, pgoff_t, long, + void **, pfn_t *); +}; int dax_read_lock(void); void dax_read_unlock(int id); -- cgit v1.2.3-59-g8ed1b From c1d6e828a35df524df2af277eedd1471d05e4f4c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jan 2017 23:02:09 -0800 Subject: pmem: add dax_operations support Setup a dax_device to have the same lifetime as the pmem block device and add a ->direct_access() method that is equivalent to pmem_direct_access(). Once fs/dax.c has been converted to use dax_operations the old pmem_direct_access() will be removed. Signed-off-by: Dan Williams --- drivers/dax/dax.h | 7 ----- drivers/nvdimm/Kconfig | 1 + drivers/nvdimm/pmem.c | 61 ++++++++++++++++++++++++++++++++--------- drivers/nvdimm/pmem.h | 7 +++-- include/linux/dax.h | 6 ++++ tools/testing/nvdimm/pmem-dax.c | 21 +++++++------- 6 files changed, 70 insertions(+), 33 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index 617bbc24be2b..f9e5feea742c 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,13 +13,6 @@ #ifndef __DAX_H__ #define __DAX_H__ struct dax_device; -struct dax_operations; -struct dax_device *alloc_dax(void *private, const char *host, - const struct dax_operations *ops); -void put_dax(struct dax_device *dax_dev); -bool dax_alive(struct dax_device *dax_dev); -void kill_dax(struct dax_device *dax_dev); struct dax_device *inode_dax(struct inode *inode); struct inode *dax_inode(struct dax_device *dax_dev); -void *dax_get_private(struct dax_device *dax_dev); #endif /* __DAX_H__ */ diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 59e750183b7f..5bdd499b5f4f 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -20,6 +20,7 @@ if LIBNVDIMM config BLK_DEV_PMEM tristate "PMEM: Persistent memory block device support" default LIBNVDIMM + select DAX select ND_BTT if BTT select ND_PFN if NVDIMM_PFN help diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b536be5a12e..fbbcf8154eec 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "pmem.h" #include "pfn.h" @@ -199,13 +200,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ -__weak long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct pmem_device *pmem = bdev->bd_queue->queuedata; - resource_size_t offset = sector * 512 + pmem->data_offset; + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)))) return -EIO; *kaddr = pmem->virt_addr + offset; *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); @@ -215,26 +216,51 @@ __weak long pmem_direct_access(struct block_device *bdev, sector_t sector, * requested range. */ if (unlikely(pmem->bb.count)) - return size; - return pmem->size - pmem->pfn_pad - offset; + return nr_pages; + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); +} + +static long pmem_blk_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, pfn_t *pfn, long size) +{ + struct pmem_device *pmem = bdev->bd_queue->queuedata; + + return __pmem_direct_access(pmem, PHYS_PFN(sector * 512), + PHYS_PFN(size), kaddr, pfn); } static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .rw_page = pmem_rw_page, - .direct_access = pmem_direct_access, + .direct_access = pmem_blk_direct_access, .revalidate_disk = nvdimm_revalidate_disk, }; +static long pmem_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); +} + +static const struct dax_operations pmem_dax_ops = { + .direct_access = pmem_dax_direct_access, +}; + static void pmem_release_queue(void *q) { blk_cleanup_queue(q); } -static void pmem_release_disk(void *disk) +static void pmem_release_disk(void *__pmem) { - del_gendisk(disk); - put_disk(disk); + struct pmem_device *pmem = __pmem; + + kill_dax(pmem->dax_dev); + put_dax(pmem->dax_dev); + del_gendisk(pmem->disk); + put_disk(pmem->disk); } static int pmem_attach_disk(struct device *dev, @@ -245,6 +271,7 @@ static int pmem_attach_disk(struct device *dev, struct vmem_altmap __altmap, *altmap = NULL; struct resource *res = &nsio->res; struct nd_pfn *nd_pfn = NULL; + struct dax_device *dax_dev; int nid = dev_to_node(dev); struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; @@ -325,6 +352,7 @@ static int pmem_attach_disk(struct device *dev, disk = alloc_disk_node(0, nid); if (!disk) return -ENOMEM; + pmem->disk = disk; disk->fops = &pmem_fops; disk->queue = q; @@ -336,9 +364,16 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; nvdimm_badblocks_populate(nd_region, &pmem->bb, res); disk->bb = &pmem->bb; - device_add_disk(dev, disk); - if (devm_add_action_or_reset(dev, pmem_release_disk, disk)) + dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops); + if (!dax_dev) { + put_disk(disk); + return -ENOMEM; + } + pmem->dax_dev = dax_dev; + + device_add_disk(dev, disk); + if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) return -ENOMEM; revalidate_disk(disk); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index b4ee4f71b4a1..7f4dbd72a90a 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -5,8 +5,6 @@ #include #include -long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size); /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ @@ -20,5 +18,10 @@ struct pmem_device { /* trim size when namespace capacity has been section aligned */ u32 pfn_pad; struct badblocks bb; + struct dax_device *dax_dev; + struct gendisk *disk; }; + +long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn); #endif /* __NVDIMM_PMEM_H__ */ diff --git a/include/linux/dax.h b/include/linux/dax.h index 74ebb92b625a..39a0312c45c3 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -21,6 +21,12 @@ struct dax_operations { int dax_read_lock(void); void dax_read_unlock(int id); struct dax_device *dax_get_by_host(const char *host); +struct dax_device *alloc_dax(void *private, const char *host, + const struct dax_operations *ops); +void put_dax(struct dax_device *dax_dev); +bool dax_alive(struct dax_device *dax_dev); +void kill_dax(struct dax_device *dax_dev); +void *dax_get_private(struct dax_device *dax_dev); /* * We use lowest available bit in exceptional entry for locking, one bit for diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c index c9b8c48f85fc..b53596ad601b 100644 --- a/tools/testing/nvdimm/pmem-dax.c +++ b/tools/testing/nvdimm/pmem-dax.c @@ -15,13 +15,13 @@ #include #include -long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct pmem_device *pmem = bdev->bd_queue->queuedata; - resource_size_t offset = sector * 512 + pmem->data_offset; + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)))) return -EIO; /* @@ -34,11 +34,10 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector, *kaddr = pmem->virt_addr + offset; page = vmalloc_to_page(pmem->virt_addr + offset); *pfn = page_to_pfn_t(page); - dev_dbg_ratelimited(disk_to_dev(bdev->bd_disk)->parent, - "%s: sector: %#llx pfn: %#lx\n", __func__, - (unsigned long long) sector, page_to_pfn(page)); + pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n", + __func__, pmem, pgoff, page_to_pfn(page)); - return PAGE_SIZE; + return 1; } *kaddr = pmem->virt_addr + offset; @@ -49,6 +48,6 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector, * requested range. */ if (unlikely(pmem->bb.count)) - return size; - return pmem->size - pmem->pfn_pad - offset; + return nr_pages; + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); } -- cgit v1.2.3-59-g8ed1b From b0686260fecaa924d8eff2ace94bee70506bc308 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Jan 2017 20:37:35 -0800 Subject: dax: introduce dax_direct_access() Replace bdev_direct_access() with dax_direct_access() that uses dax_device and dax_operations instead of a block_device and block_device_operations for dax. Once all consumers of the old api have been converted bdev_direct_access() will be deleted. Given that block device partitioning decisions can cause dax page alignment constraints to be violated this also introduces the bdev_dax_pgoff() helper. It handles calculating a logical pgoff relative to the dax_device and also checks for page alignment. Signed-off-by: Dan Williams --- block/Kconfig | 1 + drivers/dax/super.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/block_dev.c | 14 ++++++++++++++ include/linux/blkdev.h | 1 + include/linux/dax.h | 2 ++ 5 files changed, 57 insertions(+) (limited to 'drivers/dax') diff --git a/block/Kconfig b/block/Kconfig index e9f780f815f5..93da7fc3f254 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -6,6 +6,7 @@ menuconfig BLOCK default y select SBITMAP select SRCU + select DAX help Provide block layer support for the kernel. diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 1a58542ee8fd..465dcd7317d5 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -65,6 +65,45 @@ struct dax_device { const struct dax_operations *ops; }; +/** + * dax_direct_access() - translate a device pgoff to an absolute pfn + * @dax_dev: a dax_device instance representing the logical memory range + * @pgoff: offset in pages from the start of the device to translate + * @nr_pages: number of consecutive pages caller can handle relative to @pfn + * @kaddr: output parameter that returns a virtual address mapping of pfn + * @pfn: output parameter that returns an absolute pfn translation of @pgoff + * + * Return: negative errno if an error occurs, otherwise the number of + * pages accessible at the device relative @pgoff. + */ +long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn) +{ + long avail; + + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + + if (!dax_dev) + return -EOPNOTSUPP; + + if (!dax_alive(dax_dev)) + return -ENXIO; + + if (nr_pages < 0) + return nr_pages; + + avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, + kaddr, pfn); + if (!avail) + return -ERANGE; + return min(avail, nr_pages); +} +EXPORT_SYMBOL_GPL(dax_direct_access); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); diff --git a/fs/block_dev.c b/fs/block_dev.c index 7f40ea2f0875..2f7885712575 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -762,6 +763,19 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) } EXPORT_SYMBOL_GPL(bdev_direct_access); +int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) +{ + phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; + + if (pgoff) + *pgoff = PHYS_PFN(phys_off); + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(bdev_dax_pgoff); + /** * bdev_dax_supported() - Check if the device supports dax for filesystem * @sb: The superblock of the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72708399b83..612c497d1461 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1958,6 +1958,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); extern int bdev_dax_supported(struct super_block *, int); +int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/dax.h b/include/linux/dax.h index 39a0312c45c3..7e62e280c11f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -27,6 +27,8 @@ void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); +long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn); /* * We use lowest available bit in exceptional entry for locking, one bit for -- cgit v1.2.3-59-g8ed1b From 565851c972b50612f3a4542e26879ffb3e906fc2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 30 Apr 2017 06:57:01 -0700 Subject: device-dax: fix sysfs attribute deadlock Usage of device_lock() for dax_region attributes is unnecessary and deadlock prone. It's unnecessary because the order of registration / un-registration guarantees that drvdata is always valid. It's deadlock prone because it sets up this situation: ndctl D 0 2170 2082 0x00000000 Call Trace: __schedule+0x31f/0x980 schedule+0x3d/0x90 schedule_preempt_disabled+0x15/0x20 __mutex_lock+0x402/0x980 ? __mutex_lock+0x158/0x980 ? align_show+0x2b/0x80 [dax] ? kernfs_seq_start+0x2f/0x90 mutex_lock_nested+0x1b/0x20 align_show+0x2b/0x80 [dax] dev_attr_show+0x20/0x50 ndctl D 0 2186 2079 0x00000000 Call Trace: __schedule+0x31f/0x980 schedule+0x3d/0x90 __kernfs_remove+0x1f6/0x340 ? kernfs_remove_by_name_ns+0x45/0xa0 ? remove_wait_queue+0x70/0x70 kernfs_remove_by_name_ns+0x45/0xa0 remove_files.isra.1+0x35/0x70 sysfs_remove_group+0x44/0x90 sysfs_remove_groups+0x2e/0x50 dax_region_unregister+0x25/0x40 [dax] devm_action_release+0xf/0x20 release_nodes+0x16d/0x2b0 devres_release_all+0x3c/0x60 device_release_driver_internal+0x17d/0x220 device_release_driver+0x12/0x20 unbind_store+0x112/0x160 ndctl/2170 is trying to acquire the device_lock() to read an attribute, and ndctl/2186 is holding the device_lock() while trying to drain all active attribute readers. Thanks to Yi Zhang for the reproduction script. Fixes: d7fe1a67f658 ("dax: add region 'id', 'size', and 'align' attributes") Cc: Reported-by: Yi Zhang Signed-off-by: Dan Williams --- drivers/dax/dax.c | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index ef93aa84622b..5e8302d3a89c 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -36,36 +36,27 @@ static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); +/* + * Rely on the fact that drvdata is set before the attributes are + * registered, and that the attributes are unregistered before drvdata + * is cleared to assume that drvdata is always valid. + */ static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_region *dax_region; - ssize_t rc = -ENXIO; + struct dax_region *dax_region = dev_get_drvdata(dev); - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%d\n", dax_region->id); - device_unlock(dev); - - return rc; + return sprintf(buf, "%d\n", dax_region->id); } static DEVICE_ATTR_RO(id); static ssize_t region_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_region *dax_region; - ssize_t rc = -ENXIO; + struct dax_region *dax_region = dev_get_drvdata(dev); - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); - device_unlock(dev); - - return rc; + return sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&dax_region->res)); } static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); @@ -73,16 +64,9 @@ static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_region *dax_region; - ssize_t rc = -ENXIO; + struct dax_region *dax_region = dev_get_drvdata(dev); - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%u\n", dax_region->align); - device_unlock(dev); - - return rc; + return sprintf(buf, "%u\n", dax_region->align); } static DEVICE_ATTR_RO(align); -- cgit v1.2.3-59-g8ed1b