Diffstat (limited to 'Documentation/filesystems/locking.rst')
-rw-r--r--  Documentation/filesystems/locking.rst  81
1 file changed, 54 insertions(+), 27 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 2183fd8cc350..d36fe79167b3 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -70,7 +70,7 @@ prototypes::
const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *);
void (*truncate) (struct inode *);
int (*permission) (struct inode *, int, unsigned int);
- int (*get_acl)(struct inode *, int);
+ struct posix_acl * (*get_acl)(struct inode *, int, bool);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
@@ -271,19 +271,19 @@ prototypes::
locking rules:
All except set_page_dirty and freepage may block
-====================== ======================== =========
-ops                    PageLocked(page)         i_rwsem
-====================== ======================== =========
+====================== ======================== ========= ===============
+ops                    PageLocked(page)         i_rwsem   invalidate_lock
+====================== ======================== ========= ===============
writepage:             yes, unlocks (see below)
-readpage:              yes, unlocks
+readpage:              yes, unlocks                       shared
writepages:
set_page_dirty         no
-readahead:             yes, unlocks
-readpages:             no
+readahead:             yes, unlocks                       shared
+readpages:             no                                 shared
write_begin:           locks the page           exclusive
write_end:             yes, unlocks             exclusive
bmap:
-invalidatepage:        yes
+invalidatepage:        yes                                exclusive
releasepage:           yes
freepage:              yes
direct_IO:
@@ -295,7 +295,7 @@ is_partially_uptodate: yes
error_remove_page:     yes
swap_activate:         no
swap_deactivate:       no
-====================== ======================== =========
+====================== ======================== ========= ===============
->write_begin(), ->write_end() and ->readpage() may be called from
the request handler (/dev/loop).
@@ -378,7 +378,10 @@ keep it that way and don't breed new callers.
->invalidatepage() is called when the filesystem must attempt to drop
some or all of the buffers from the page when it is being truncated. It
returns zero on success. If ->invalidatepage is zero, the kernel uses
-block_invalidatepage() instead.
+block_invalidatepage() instead. The filesystem must exclusively acquire
+invalidate_lock before invalidating page cache in truncate / hole punch path
+(and thus calling into ->invalidatepage) to block races between page cache
+invalidation and page cache filling functions (fault, read, ...).
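
As a hedged illustration of that ordering (not part of the patch itself), a
truncate path could look as follows. It assumes the filemap_invalidate_lock()
helpers added together with invalidate_lock; myfs_truncate_blocks() is a
hypothetical helper that frees the on-disk blocks::

	#include <linux/fs.h>
	#include <linux/mm.h>

	/* Sketch only: the caller is expected to hold inode->i_rwsem. */
	static void myfs_truncate_to(struct inode *inode, loff_t newsize)
	{
		/*
		 * Taking invalidate_lock exclusively blocks ->fault,
		 * ->readpage and readahead from re-filling the page cache
		 * while the range is being invalidated.
		 */
		filemap_invalidate_lock(inode->i_mapping);
		/* Drops page cache pages, calling into ->invalidatepage(). */
		truncate_setsize(inode, newsize);
		/* Hypothetical: update the offset -> block mapping on disk. */
		myfs_truncate_blocks(inode, newsize);
		filemap_invalidate_unlock(inode->i_mapping);
	}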
->releasepage() is called when the kernel is about to try to drop the
buffers from the page in preparation for freeing it. It returns zero to
@@ -506,6 +509,7 @@ prototypes::
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iopoll) (struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
@@ -518,12 +522,6 @@ prototypes::
int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
- ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
- loff_t *);
- ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
- loff_t *);
- ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
- void __user *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long,
@@ -536,6 +534,14 @@ prototypes::
size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *, int, loff_t, loff_t);
+ void (*show_fdinfo)(struct seq_file *m, struct file *f);
+ unsigned (*mmap_capabilities)(struct file *);
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
+ loff_t, size_t, unsigned int);
+ loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
+ int (*fadvise)(struct file *, loff_t, loff_t, int);
locking rules:
All may block.
@@ -570,6 +576,25 @@ in sys_read() and friends.
the lease within the individual filesystem to record the result of the
operation
+->fallocate implementation must be really careful to maintain page cache
+consistency when punching holes or performing other operations that invalidate
+page cache contents. Usually the filesystem needs to call
+truncate_inode_pages_range() to invalidate relevant range of the page cache.
+However the filesystem usually also needs to update its internal (and on disk)
+view of file offset -> disk block mapping. Until this update is finished, the
+filesystem needs to block page faults and reads from reloading now-stale page
+cache contents from the disk. Since VFS acquires mapping->invalidate_lock in
+shared mode when loading pages from disk (filemap_fault(), filemap_read(),
+readahead paths), the fallocate implementation must take the invalidate_lock to
+prevent reloading.
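
A minimal hole-punch sketch following these rules might look as below.
This is illustrative only; myfs_remove_blocks() is a hypothetical stand-in
for the filesystem's real block-mapping update::

	#include <linux/fs.h>
	#include <linux/mm.h>

	static long myfs_punch_hole(struct inode *inode, loff_t offset,
				    loff_t len)
	{
		struct address_space *mapping = inode->i_mapping;
		long ret;

		inode_lock(inode);	/* serialize against write(2) etc. */
		/*
		 * Held exclusively: page faults and reads must not reload
		 * now-stale page cache until the block mapping is updated.
		 */
		filemap_invalidate_lock(mapping);
		truncate_pagecache_range(inode, offset, offset + len - 1);
		/* Hypothetical: punch the range out of the on-disk mapping. */
		ret = myfs_remove_blocks(inode, offset, len);
		filemap_invalidate_unlock(mapping);
		inode_unlock(inode);
		return ret;
	}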
+
+->copy_file_range and ->remap_file_range implementations need to serialize
+against modifications of file data while the operation is running. For
+blocking changes through write(2) and similar operations inode->i_rwsem can be
+used. To block changes to file contents via a memory mapping during the
+operation, the filesystem must take mapping->invalidate_lock to coordinate
+with ->page_mkwrite.
+
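For example, a ->remap_file_range() implementation could combine the two
locks as in this sketch. It assumes the filemap_invalidate_lock_two()
helper from the same series; myfs_do_remap() is hypothetical::

	#include <linux/fs.h>

	static loff_t myfs_remap_file_range(struct file *file_in, loff_t pos_in,
					    struct file *file_out, loff_t pos_out,
					    loff_t len, unsigned int remap_flags)
	{
		struct inode *inode_in = file_inode(file_in);
		struct inode *inode_out = file_inode(file_out);
		loff_t ret;

		/* Block write(2) and similar operations on both files. */
		lock_two_nondirectories(inode_in, inode_out);
		/* Block ->page_mkwrite() so mmap stores cannot race the remap. */
		filemap_invalidate_lock_two(inode_in->i_mapping,
					    inode_out->i_mapping);
		/* Hypothetical: share / move the underlying blocks. */
		ret = myfs_do_remap(inode_in, pos_in, inode_out, pos_out,
				    len, remap_flags);
		filemap_invalidate_unlock_two(inode_in->i_mapping,
					      inode_out->i_mapping);
		unlock_two_nondirectories(inode_in, inode_out);
		return ret;
	}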
dquot_operations
================
@@ -627,11 +652,11 @@ pfn_mkwrite: yes
access:       yes
============= ========= ===========================
-->fault() is called when a previously not present pte is about
-to be faulted in. The filesystem must find and return the page associated
-with the passed in "pgoff" in the vm_fault structure. If it is possible that
-the page may be truncated and/or invalidated, then the filesystem must lock
-the page, then ensure it is not already truncated (the page lock will block
+->fault() is called when a previously not present pte is about to be faulted
+in. The filesystem must find and return the page associated with the passed in
+"pgoff" in the vm_fault structure. If it is possible that the page may be
+truncated and/or invalidated, then the filesystem must lock invalidate_lock,
+then ensure the page is not already truncated (invalidate_lock will block
subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.
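
Filesystems without extra fault-time work can rely on the generic helper,
which implements this protocol; a sketch of the wiring follows, where
myfs_page_mkwrite is a placeholder (see the ->page_mkwrite sketch further
below)::

	#include <linux/fs.h>
	#include <linux/mm.h>

	static const struct vm_operations_struct myfs_vm_ops = {
		/*
		 * filemap_fault() takes mapping->invalidate_lock shared
		 * while filling the page cache from disk and returns with
		 * the page locked (VM_FAULT_LOCKED), so a truncate or hole
		 * punch holding the lock exclusively cannot slip in between.
		 */
		.fault		= filemap_fault,
		.map_pages	= filemap_map_pages,
		.page_mkwrite	= myfs_page_mkwrite,	/* hypothetical */
	};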
@@ -644,12 +669,14 @@ page table entry. Pointer to entry associated with the page is passed in
"pte" field in vm_fault structure. Pointers to entries for other offsets
should be calculated relative to "pte".
-->page_mkwrite() is called when a previously read-only pte is
-about to become writeable. The filesystem again must ensure that there are
-no truncate/invalidate races, and then return with the page locked. If
-the page has been truncated, the filesystem should not look up a new page
-like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
-will cause the VM to retry the fault.
+->page_mkwrite() is called when a previously read-only pte is about to become
+writeable. The filesystem again must ensure that there are no
+truncate/invalidate races or races with operations such as ->remap_file_range
+or ->copy_file_range, and then return with the page locked. Usually
+mapping->invalidate_lock is suitable for proper serialization. If the page has
+been truncated, the filesystem should not look up a new page like the ->fault()
+handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to
+retry the fault.
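
A hedged ->page_mkwrite() sketch using invalidate_lock for that
serialization (block allocation details omitted)::

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/pagemap.h>

	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
	{
		struct page *page = vmf->page;
		struct inode *inode = file_inode(vmf->vma->vm_file);
		vm_fault_t ret = VM_FAULT_LOCKED;

		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
		/*
		 * Shared mode: serializes against truncate, hole punch and
		 * ->remap_file_range(), which hold the lock exclusively.
		 */
		filemap_invalidate_lock_shared(inode->i_mapping);
		lock_page(page);
		if (page->mapping != inode->i_mapping) {
			/* Truncated under us: do not look up a new page. */
			unlock_page(page);
			ret = VM_FAULT_NOPAGE;
		}
		/* ... else allocate/reserve backing blocks for the page ... */
		filemap_invalidate_unlock_shared(inode->i_mapping);
		sb_end_pagefault(inode->i_sb);
		return ret;	/* page stays locked on success */
	}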
->pfn_mkwrite() is the same as page_mkwrite but when the pte is
VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is