From ac401cc782429cc8560ce4840b1405d603740917 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:18 +0200 Subject: dax: New fault locking Currently DAX page fault locking is racy. CPU0 (write fault) CPU1 (read fault) __dax_fault() __dax_fault() get_block(inode, block, &bh, 0) -> not mapped get_block(inode, block, &bh, 0) -> not mapped if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) get_block(inode, block, &bh, 1) -> allocates blocks if (page) -> no if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) { } else { dax_load_hole(); } dax_insert_mapping() And we are in a situation where we fail in dax_radix_entry() with -EIO. Another problem with the current DAX page fault locking is that there is no race-free way to clear the dirty tag in the radix tree. We can always end up with a clean radix tree and dirty data in the CPU cache. We fix the first problem by introducing locking of exceptional radix tree entries in DAX mappings acting very similarly to page lock and thus synchronizing properly faults against the same mapping index. The same lock can later be used to avoid races when clearing the radix tree dirty tag. 
Reviewed-by: NeilBrown Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/dax.h') diff --git a/include/linux/dax.h b/include/linux/dax.h index aa148937bb3f..756625c6d0dd 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); -- cgit v1.2.3-59-g8ed1b