Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/vfs_file.c | 6
-rw-r--r-- fs/9p/vfs_super.c | 6
-rw-r--r-- fs/Makefile | 4
-rw-r--r-- fs/afs/cell.c | 328
-rw-r--r-- fs/afs/dynroot.c | 23
-rw-r--r-- fs/afs/inode.c | 47
-rw-r--r-- fs/afs/internal.h | 21
-rw-r--r-- fs/afs/main.c | 2
-rw-r--r-- fs/afs/mntpt.c | 4
-rw-r--r-- fs/afs/proc.c | 23
-rw-r--r-- fs/afs/server.c | 7
-rw-r--r-- fs/afs/super.c | 19
-rw-r--r-- fs/afs/vl_alias.c | 8
-rw-r--r-- fs/afs/vl_rotate.c | 2
-rw-r--r-- fs/afs/volume.c | 6
-rw-r--r-- fs/afs/write.c | 11
-rw-r--r-- fs/aio.c | 8
-rw-r--r-- fs/autofs/dev-ioctl.c | 8
-rw-r--r-- fs/autofs/waitq.c | 2
-rw-r--r-- fs/binfmt_elf.c | 266
-rw-r--r-- fs/binfmt_elf_fdpic.c | 162
-rw-r--r-- fs/block_dev.c | 184
-rw-r--r-- fs/btrfs/Kconfig | 1
-rw-r--r-- fs/btrfs/backref.c | 1
-rw-r--r-- fs/btrfs/block-group.c | 66
-rw-r--r-- fs/btrfs/btrfs_inode.h | 30
-rw-r--r-- fs/btrfs/compression.c | 35
-rw-r--r-- fs/btrfs/compression.h | 35
-rw-r--r-- fs/btrfs/ctree.c | 204
-rw-r--r-- fs/btrfs/ctree.h | 103
-rw-r--r-- fs/btrfs/delalloc-space.c | 123
-rw-r--r-- fs/btrfs/delayed-inode.c | 6
-rw-r--r-- fs/btrfs/dev-replace.c | 118
-rw-r--r-- fs/btrfs/disk-io.c | 170
-rw-r--r-- fs/btrfs/disk-io.h | 9
-rw-r--r-- fs/btrfs/extent-io-tree.h | 3
-rw-r--r-- fs/btrfs/extent-tree.c | 206
-rw-r--r-- fs/btrfs/extent_io.c | 216
-rw-r--r-- fs/btrfs/extent_io.h | 23
-rw-r--r-- fs/btrfs/file-item.c | 4
-rw-r--r-- fs/btrfs/file.c | 316
-rw-r--r-- fs/btrfs/free-space-cache.c | 23
-rw-r--r-- fs/btrfs/inode.c | 788
-rw-r--r-- fs/btrfs/ioctl.c | 71
-rw-r--r-- fs/btrfs/locking.c | 45
-rw-r--r-- fs/btrfs/locking.h | 78
-rw-r--r-- fs/btrfs/ordered-data.c | 113
-rw-r--r-- fs/btrfs/ordered-data.h | 24
-rw-r--r-- fs/btrfs/print-tree.c | 38
-rw-r--r-- fs/btrfs/print-tree.h | 4
-rw-r--r-- fs/btrfs/qgroup.c | 2
-rw-r--r-- fs/btrfs/reada.c | 30
-rw-r--r-- fs/btrfs/reflink.c | 46
-rw-r--r-- fs/btrfs/relocation.c | 11
-rw-r--r-- fs/btrfs/root-tree.c | 13
-rw-r--r-- fs/btrfs/scrub.c | 8
-rw-r--r-- fs/btrfs/send.c | 365
-rw-r--r-- fs/btrfs/send.h | 1
-rw-r--r-- fs/btrfs/space-info.c | 323
-rw-r--r-- fs/btrfs/space-info.h | 2
-rw-r--r-- fs/btrfs/struct-funcs.c | 10
-rw-r--r-- fs/btrfs/super.c | 6
-rw-r--r-- fs/btrfs/sysfs.c | 249
-rw-r--r-- fs/btrfs/sysfs.h | 11
-rw-r--r-- fs/btrfs/tests/extent-buffer-tests.c | 3
-rw-r--r-- fs/btrfs/tests/inode-tests.c | 7
-rw-r--r-- fs/btrfs/transaction.c | 15
-rw-r--r-- fs/btrfs/transaction.h | 8
-rw-r--r-- fs/btrfs/tree-checker.c | 17
-rw-r--r-- fs/btrfs/tree-log.c | 284
-rw-r--r-- fs/btrfs/tree-log.h | 32
-rw-r--r-- fs/btrfs/volumes.c | 416
-rw-r--r-- fs/btrfs/volumes.h | 11
-rw-r--r-- fs/buffer.c | 22
-rw-r--r-- fs/ceph/addr.c | 416
-rw-r--r-- fs/ceph/caps.c | 128
-rw-r--r-- fs/ceph/debugfs.c | 18
-rw-r--r-- fs/ceph/dir.c | 20
-rw-r--r-- fs/ceph/file.c | 85
-rw-r--r-- fs/ceph/inode.c | 10
-rw-r--r-- fs/ceph/locks.c | 2
-rw-r--r-- fs/ceph/mds_client.c | 109
-rw-r--r-- fs/ceph/mds_client.h | 2
-rw-r--r-- fs/ceph/metric.c | 14
-rw-r--r-- fs/ceph/metric.h | 7
-rw-r--r-- fs/ceph/quota.c | 10
-rw-r--r-- fs/ceph/snap.c | 2
-rw-r--r-- fs/ceph/super.c | 8
-rw-r--r-- fs/ceph/super.h | 13
-rw-r--r-- fs/ceph/xattr.c | 3
-rw-r--r-- fs/cifs/smb2ops.c | 2
-rw-r--r-- fs/compat.c | 132
-rw-r--r-- fs/configfs/dir.c | 2
-rw-r--r-- fs/configfs/file.c | 2
-rw-r--r-- fs/coredump.c | 236
-rw-r--r-- fs/crypto/crypto.c | 4
-rw-r--r-- fs/crypto/fname.c | 60
-rw-r--r-- fs/crypto/fscrypt_private.h | 10
-rw-r--r-- fs/crypto/hooks.c | 80
-rw-r--r-- fs/crypto/inline_crypt.c | 7
-rw-r--r-- fs/crypto/keyring.c | 9
-rw-r--r-- fs/crypto/keysetup.c | 182
-rw-r--r-- fs/crypto/keysetup_v1.c | 8
-rw-r--r-- fs/crypto/policy.c | 209
-rw-r--r-- fs/d_path.c | 6
-rw-r--r-- fs/dax.c | 42
-rw-r--r-- fs/direct-io.c | 88
-rw-r--r-- fs/dlm/Kconfig | 1
-rw-r--r-- fs/dlm/config.c | 66
-rw-r--r-- fs/dlm/config.h | 4
-rw-r--r-- fs/dlm/lowcomms.c | 329
-rw-r--r-- fs/dlm/midcomms.c | 136
-rw-r--r-- fs/dlm/midcomms.h | 3
-rw-r--r-- fs/dlm/netlink.c | 6
-rw-r--r-- fs/efivarfs/super.c | 3
-rw-r--r-- fs/erofs/data.c | 2
-rw-r--r-- fs/erofs/super.c | 2
-rw-r--r-- fs/erofs/xattr.c | 2
-rw-r--r-- fs/erofs/zdata.c | 48
-rw-r--r-- fs/eventpoll.c | 72
-rw-r--r-- fs/exec.c | 155
-rw-r--r-- fs/exfat/cache.c | 11
-rw-r--r-- fs/exfat/dir.c | 29
-rw-r--r-- fs/exfat/exfat_fs.h | 7
-rw-r--r-- fs/exfat/file.c | 4
-rw-r--r-- fs/exfat/inode.c | 7
-rw-r--r-- fs/exfat/namei.c | 166
-rw-r--r-- fs/exfat/nls.c | 2
-rw-r--r-- fs/exfat/super.c | 6
-rw-r--r-- fs/ext2/balloc.c | 6
-rw-r--r-- fs/ext2/inode.c | 1
-rw-r--r-- fs/ext4/dir.c | 2
-rw-r--r-- fs/ext4/ext4.h | 6
-rw-r--r-- fs/ext4/ialloc.c | 119
-rw-r--r-- fs/ext4/namei.c | 7
-rw-r--r-- fs/ext4/super.c | 16
-rw-r--r-- fs/ext4/verity.c | 4
-rw-r--r-- fs/f2fs/acl.c | 6
-rw-r--r-- fs/f2fs/checkpoint.c | 17
-rw-r--r-- fs/f2fs/compress.c | 242
-rw-r--r-- fs/f2fs/data.c | 119
-rw-r--r-- fs/f2fs/debug.c | 18
-rw-r--r-- fs/f2fs/dir.c | 115
-rw-r--r-- fs/f2fs/extent_cache.c | 37
-rw-r--r-- fs/f2fs/f2fs.h | 143
-rw-r--r-- fs/f2fs/file.c | 88
-rw-r--r-- fs/f2fs/gc.c | 413
-rw-r--r-- fs/f2fs/gc.h | 69
-rw-r--r-- fs/f2fs/inline.c | 4
-rw-r--r-- fs/f2fs/inode.c | 21
-rw-r--r-- fs/f2fs/namei.c | 9
-rw-r--r-- fs/f2fs/node.c | 7
-rw-r--r-- fs/f2fs/segment.c | 522
-rw-r--r-- fs/f2fs/segment.h | 71
-rw-r--r-- fs/f2fs/super.c | 183
-rw-r--r-- fs/f2fs/sysfs.c | 22
-rw-r--r-- fs/f2fs/verity.c | 4
-rw-r--r-- fs/f2fs/xattr.c | 8
-rw-r--r-- fs/file.c | 2
-rw-r--r-- fs/fs-writeback.c | 9
-rw-r--r-- fs/fs_parser.c | 2
-rw-r--r-- fs/fuse/Kconfig | 16
-rw-r--r-- fs/fuse/Makefile | 6
-rw-r--r-- fs/fuse/control.c | 20
-rw-r--r-- fs/fuse/cuse.c | 21
-rw-r--r-- fs/fuse/dax.c | 1365
-rw-r--r-- fs/fuse/dev.c | 189
-rw-r--r-- fs/fuse/dir.c | 220
-rw-r--r-- fs/fuse/file.c | 276
-rw-r--r-- fs/fuse/fuse_i.h | 185
-rw-r--r-- fs/fuse/inode.c | 395
-rw-r--r-- fs/fuse/readdir.c | 10
-rw-r--r-- fs/fuse/virtio_fs.c | 378
-rw-r--r-- fs/fuse/xattr.c | 34
-rw-r--r-- fs/inode.c | 2
-rw-r--r-- fs/internal.h | 3
-rw-r--r-- fs/io-wq.c | 243
-rw-r--r-- fs/io-wq.h | 14
-rw-r--r-- fs/io_uring.c | 2793
-rw-r--r-- fs/iomap/buffered-io.c | 194
-rw-r--r-- fs/iomap/direct-io.c | 49
-rw-r--r-- fs/jfs/jfs_metapage.c | 2
-rw-r--r-- fs/kernel_read_file.c | 189
-rw-r--r-- fs/libfs.c | 87
-rw-r--r-- fs/lockd/mon.c | 2
-rw-r--r-- fs/namei.c | 4
-rw-r--r-- fs/namespace.c | 29
-rw-r--r-- fs/nfs/dir.c | 3
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 43
-rw-r--r-- fs/nfs/fs_context.c | 196
-rw-r--r-- fs/nfs/namespace.c | 12
-rw-r--r-- fs/nfs/nfs42proc.c | 10
-rw-r--r-- fs/nfs/nfs42xattr.c | 5
-rw-r--r-- fs/nfs/nfs42xdr.c | 167
-rw-r--r-- fs/nfs/nfs4_fs.h | 8
-rw-r--r-- fs/nfs/nfs4client.c | 2
-rw-r--r-- fs/nfs/nfs4file.c | 3
-rw-r--r-- fs/nfs/nfs4idmap.c | 15
-rw-r--r-- fs/nfs/nfs4proc.c | 272
-rw-r--r-- fs/nfs/nfs4trace.h | 1
-rw-r--r-- fs/nfs/nfs4xdr.c | 7
-rw-r--r-- fs/nfs/pnfs.c | 2
-rw-r--r-- fs/nfs/super.c | 11
-rw-r--r-- fs/nfs/sysfs.c | 11
-rw-r--r-- fs/nfs/sysfs.h | 2
-rw-r--r-- fs/nfsd/blocklayout.c | 4
-rw-r--r-- fs/nilfs2/bmap.c | 2
-rw-r--r-- fs/nilfs2/cpfile.c | 6
-rw-r--r-- fs/nilfs2/page.c | 1
-rw-r--r-- fs/nilfs2/sufile.c | 4
-rw-r--r-- fs/notify/fanotify/fanotify.c | 5
-rw-r--r-- fs/notify/inotify/inotify_fsnotify.c | 5
-rw-r--r-- fs/ntfs/inode.c | 6
-rw-r--r-- fs/ocfs2/alloc.c | 6
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 28
-rw-r--r-- fs/ocfs2/localalloc.c | 2
-rw-r--r-- fs/overlayfs/copy_up.c | 59
-rw-r--r-- fs/overlayfs/dir.c | 2
-rw-r--r-- fs/overlayfs/export.c | 2
-rw-r--r-- fs/overlayfs/file.c | 88
-rw-r--r-- fs/overlayfs/inode.c | 32
-rw-r--r-- fs/overlayfs/namei.c | 57
-rw-r--r-- fs/overlayfs/overlayfs.h | 92
-rw-r--r-- fs/overlayfs/ovl_entry.h | 6
-rw-r--r-- fs/overlayfs/readdir.c | 76
-rw-r--r-- fs/overlayfs/super.c | 117
-rw-r--r-- fs/overlayfs/util.c | 96
-rw-r--r-- fs/pipe.c | 73
-rw-r--r-- fs/proc/base.c | 7
-rw-r--r-- fs/proc/page.c | 3
-rw-r--r-- fs/proc/task_mmu.c | 126
-rw-r--r-- fs/quota/Kconfig | 5
-rw-r--r-- fs/quota/Makefile | 1
-rw-r--r-- fs/quota/compat.c | 120
-rw-r--r-- fs/quota/compat.h | 34
-rw-r--r-- fs/quota/quota.c | 115
-rw-r--r-- fs/quota/quota_v2.c | 1
-rw-r--r-- fs/ramfs/file-nommu.c | 2
-rw-r--r-- fs/read_write.c | 370
-rw-r--r-- fs/reiserfs/inode.c | 9
-rw-r--r-- fs/reiserfs/super.c | 8
-rw-r--r-- fs/reiserfs/xattr.c | 7
-rw-r--r-- fs/romfs/super.c | 1
-rw-r--r-- fs/splice.c | 85
-rw-r--r-- fs/super.c | 2
-rw-r--r-- fs/sysfs/file.c | 55
-rw-r--r-- fs/ubifs/auth.c | 2
-rw-r--r-- fs/ubifs/debug.c | 1
-rw-r--r-- fs/ubifs/dir.c | 40
-rw-r--r-- fs/ubifs/gc.c | 4
-rw-r--r-- fs/ubifs/ioctl.c | 1
-rw-r--r-- fs/ubifs/journal.c | 7
-rw-r--r-- fs/ubifs/orphan.c | 2
-rw-r--r-- fs/ubifs/replay.c | 2
-rw-r--r-- fs/ubifs/super.c | 46
-rw-r--r-- fs/ubifs/tnc.c | 6
-rw-r--r-- fs/ubifs/xattr.c | 2
-rw-r--r-- fs/udf/directory.c | 2
-rw-r--r-- fs/udf/file.c | 7
-rw-r--r-- fs/udf/ialloc.c | 14
-rw-r--r-- fs/udf/inode.c | 61
-rw-r--r-- fs/udf/misc.c | 6
-rw-r--r-- fs/udf/namei.c | 7
-rw-r--r-- fs/udf/partition.c | 2
-rw-r--r-- fs/udf/super.c | 47
-rw-r--r-- fs/udf/symlink.c | 2
-rw-r--r-- fs/udf/udf_i.h | 6
-rw-r--r-- fs/unicode/utf8-core.c | 23
-rw-r--r-- fs/userfaultfd.c | 28
-rw-r--r-- fs/vboxsf/dir.c | 2
-rw-r--r-- fs/vboxsf/super.c | 4
-rw-r--r-- fs/xattr.c | 22
-rw-r--r-- fs/xfs/Kconfig | 25
-rw-r--r-- fs/xfs/kmem.c | 22
-rw-r--r-- fs/xfs/kmem.h | 7
-rw-r--r-- fs/xfs/libxfs/xfs_ag.c | 5
-rw-r--r-- fs/xfs/libxfs/xfs_attr.c | 14
-rw-r--r-- fs/xfs/libxfs/xfs_attr_leaf.c | 43
-rw-r--r-- fs/xfs/libxfs/xfs_attr_remote.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_attr_sf.h | 29
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 19
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.h | 24
-rw-r--r-- fs/xfs/libxfs/xfs_defer.c | 232
-rw-r--r-- fs/xfs/libxfs/xfs_defer.h | 37
-rw-r--r-- fs/xfs/libxfs/xfs_dquot_buf.c | 35
-rw-r--r-- fs/xfs/libxfs/xfs_format.h | 211
-rw-r--r-- fs/xfs/libxfs/xfs_fs.h | 1
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 5
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c | 65
-rw-r--r-- fs/xfs/libxfs/xfs_iext_tree.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c | 130
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.h | 17
-rw-r--r-- fs/xfs/libxfs/xfs_inode_fork.c | 8
-rw-r--r-- fs/xfs/libxfs/xfs_log_format.h | 7
-rw-r--r-- fs/xfs/libxfs/xfs_log_recover.h | 1
-rw-r--r-- fs/xfs/libxfs/xfs_quota_defs.h | 8
-rw-r--r-- fs/xfs/libxfs/xfs_rmap.c | 27
-rw-r--r-- fs/xfs/libxfs/xfs_rtbitmap.c | 11
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 6
-rw-r--r-- fs/xfs/libxfs/xfs_shared.h | 3
-rw-r--r-- fs/xfs/libxfs/xfs_trans_inode.c | 17
-rw-r--r-- fs/xfs/scrub/agheader.c | 30
-rw-r--r-- fs/xfs/scrub/agheader_repair.c | 24
-rw-r--r-- fs/xfs/scrub/dabtree.c | 14
-rw-r--r-- fs/xfs/scrub/inode.c | 31
-rw-r--r-- fs/xfs/scrub/symlink.c | 2
-rw-r--r-- fs/xfs/xfs_acl.c | 2
-rw-r--r-- fs/xfs/xfs_aops.c | 2
-rw-r--r-- fs/xfs/xfs_attr_list.c | 6
-rw-r--r-- fs/xfs/xfs_bmap_item.c | 132
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 16
-rw-r--r-- fs/xfs/xfs_buf.c | 208
-rw-r--r-- fs/xfs/xfs_buf.h | 17
-rw-r--r-- fs/xfs/xfs_buf_item.c | 264
-rw-r--r-- fs/xfs/xfs_buf_item.h | 12
-rw-r--r-- fs/xfs/xfs_buf_item_recover.c | 4
-rw-r--r-- fs/xfs/xfs_dquot.c | 70
-rw-r--r-- fs/xfs/xfs_dquot.h | 3
-rw-r--r-- fs/xfs/xfs_extfree_item.c | 44
-rw-r--r-- fs/xfs/xfs_file.c | 17
-rw-r--r-- fs/xfs/xfs_filestream.c | 34
-rw-r--r-- fs/xfs/xfs_fsmap.c | 48
-rw-r--r-- fs/xfs/xfs_fsmap.h | 6
-rw-r--r-- fs/xfs/xfs_icache.c | 19
-rw-r--r-- fs/xfs/xfs_inode.c | 206
-rw-r--r-- fs/xfs/xfs_inode.h | 38
-rw-r--r-- fs/xfs/xfs_inode_item.c | 61
-rw-r--r-- fs/xfs/xfs_inode_item.h | 5
-rw-r--r-- fs/xfs/xfs_inode_item_recover.c | 76
-rw-r--r-- fs/xfs/xfs_ioctl.c | 151
-rw-r--r-- fs/xfs/xfs_iops.c | 4
-rw-r--r-- fs/xfs/xfs_linux.h | 1
-rw-r--r-- fs/xfs/xfs_log.c | 44
-rw-r--r-- fs/xfs/xfs_log.h | 2
-rw-r--r-- fs/xfs/xfs_log_recover.c | 281
-rw-r--r-- fs/xfs/xfs_mount.c | 32
-rw-r--r-- fs/xfs/xfs_mount.h | 1
-rw-r--r-- fs/xfs/xfs_ondisk.h | 38
-rw-r--r-- fs/xfs/xfs_qm.c | 29
-rw-r--r-- fs/xfs/xfs_qm.h | 4
-rw-r--r-- fs/xfs/xfs_qm_syscalls.c | 18
-rw-r--r-- fs/xfs/xfs_quota.h | 8
-rw-r--r-- fs/xfs/xfs_refcount_item.c | 51
-rw-r--r-- fs/xfs/xfs_rmap_item.c | 42
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 44
-rw-r--r-- fs/xfs/xfs_stats.c | 4
-rw-r--r-- fs/xfs/xfs_stats.h | 1
-rw-r--r-- fs/xfs/xfs_super.c | 72
-rw-r--r-- fs/xfs/xfs_sysctl.c | 36
-rw-r--r-- fs/xfs/xfs_trace.h | 30
-rw-r--r-- fs/xfs/xfs_trans.c | 4
-rw-r--r-- fs/xfs/xfs_trans.h | 35
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 46
-rw-r--r-- fs/xfs/xfs_trans_dquot.c | 49
-rw-r--r-- fs/zonefs/super.c | 221
-rw-r--r-- fs/zonefs/zonefs.h | 10
356 files changed, 15477 insertions(+), 8918 deletions(-)
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3576123d8299..b177fd3b1eb3 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -612,9 +612,9 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = WB_SYNC_ALL,
- .range_start = vma->vm_pgoff * PAGE_SIZE,
+ .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE,
/* absolute end, byte at end included */
- .range_end = vma->vm_pgoff * PAGE_SIZE +
+ .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE +
(vma->vm_end - vma->vm_start - 1),
};
@@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
inode = file_inode(vma->vm_file);
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
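The (loff_t) casts above are not cosmetic: on 32-bit kernels vm_pgoff is a 32-bit unsigned long, so vm_pgoff * PAGE_SIZE is evaluated in 32-bit arithmetic and wraps for mappings at file offsets of 4 GiB and beyond, and only the already-truncated result is widened to the 64-bit loff_t. A minimal userspace sketch of the wraparound, with hypothetical values standing in for vm_pgoff and PAGE_SIZE:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Model a 32-bit kernel, where unsigned long is 32 bits wide. */
	uint32_t pgoff = 0x100000;	/* page offset of a 4 GiB file position */
	uint32_t page_size = 4096;

	int64_t wrong = pgoff * page_size;		/* 32-bit multiply wraps to 0 */
	int64_t right = (int64_t)pgoff * page_size;	/* widen first: 4294967296 */

	printf("wrong=%lld right=%lld\n", (long long)wrong, (long long)right);
	return 0;
}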
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 74df32be4c6a..e34fa20acf61 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (ret)
return ret;
- if (v9ses->cache)
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
+ if (!v9ses->cache) {
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
+ }
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
diff --git a/fs/Makefile b/fs/Makefile
index 1c7b0e3f6daa..7bb2a05fda1f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,8 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o fsopen.o init.o
+ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
+ kernel_read_file.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5b79cdceefa0..52233fa6195f 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -18,8 +18,10 @@
static unsigned __read_mostly afs_cell_gc_delay = 10;
static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
+static atomic_t cell_debug_id;
-static void afs_manage_cell(struct work_struct *);
+static void afs_queue_cell_manager(struct afs_net *);
+static void afs_manage_cell_work(struct work_struct *);
static void afs_dec_cells_outstanding(struct afs_net *net)
{
@@ -37,19 +39,22 @@ static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
atomic_inc(&net->cells_outstanding);
if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
afs_dec_cells_outstanding(net);
+ } else {
+ afs_queue_cell_manager(net);
}
}
/*
- * Look up and get an activation reference on a cell record under RCU
- * conditions. The caller must hold the RCU read lock.
+ * Look up and get an activation reference on a cell record. The caller must
+ * hold net->cells_lock at least read-locked.
*/
-struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
- const char *name, unsigned int namesz)
+static struct afs_cell *afs_find_cell_locked(struct afs_net *net,
+ const char *name, unsigned int namesz,
+ enum afs_cell_trace reason)
{
struct afs_cell *cell = NULL;
struct rb_node *p;
- int n, seq = 0, ret = 0;
+ int n;
_enter("%*.*s", namesz, namesz, name);
@@ -58,61 +63,48 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
if (namesz > AFS_MAXCELLNAME)
return ERR_PTR(-ENAMETOOLONG);
- do {
- /* Unfortunately, rbtree walking doesn't give reliable results
- * under just the RCU read lock, so we have to check for
- * changes.
- */
- if (cell)
- afs_put_cell(net, cell);
- cell = NULL;
- ret = -ENOENT;
-
- read_seqbegin_or_lock(&net->cells_lock, &seq);
-
- if (!name) {
- cell = rcu_dereference_raw(net->ws_cell);
- if (cell) {
- afs_get_cell(cell);
- ret = 0;
- break;
- }
- ret = -EDESTADDRREQ;
- continue;
- }
+ if (!name) {
+ cell = net->ws_cell;
+ if (!cell)
+ return ERR_PTR(-EDESTADDRREQ);
+ goto found;
+ }
- p = rcu_dereference_raw(net->cells.rb_node);
- while (p) {
- cell = rb_entry(p, struct afs_cell, net_node);
-
- n = strncasecmp(cell->name, name,
- min_t(size_t, cell->name_len, namesz));
- if (n == 0)
- n = cell->name_len - namesz;
- if (n < 0) {
- p = rcu_dereference_raw(p->rb_left);
- } else if (n > 0) {
- p = rcu_dereference_raw(p->rb_right);
- } else {
- if (atomic_inc_not_zero(&cell->usage)) {
- ret = 0;
- break;
- }
- /* We want to repeat the search, this time with
- * the lock properly locked.
- */
- }
- cell = NULL;
- }
+ p = net->cells.rb_node;
+ while (p) {
+ cell = rb_entry(p, struct afs_cell, net_node);
- } while (need_seqretry(&net->cells_lock, seq));
+ n = strncasecmp(cell->name, name,
+ min_t(size_t, cell->name_len, namesz));
+ if (n == 0)
+ n = cell->name_len - namesz;
+ if (n < 0)
+ p = p->rb_left;
+ else if (n > 0)
+ p = p->rb_right;
+ else
+ goto found;
+ }
- done_seqretry(&net->cells_lock, seq);
+ return ERR_PTR(-ENOENT);
- if (ret != 0 && cell)
- afs_put_cell(net, cell);
+found:
+ return afs_use_cell(cell, reason);
+}
- return ret == 0 ? cell : ERR_PTR(ret);
+/*
+ * Look up and get an activation reference on a cell record.
+ */
+struct afs_cell *afs_find_cell(struct afs_net *net,
+ const char *name, unsigned int namesz,
+ enum afs_cell_trace reason)
+{
+ struct afs_cell *cell;
+
+ down_read(&net->cells_lock);
+ cell = afs_find_cell_locked(net, name, namesz, reason);
+ up_read(&net->cells_lock);
+ return cell;
}
/*
@@ -166,8 +158,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name[i] = tolower(name[i]);
cell->name[i] = 0;
- atomic_set(&cell->usage, 2);
- INIT_WORK(&cell->manager, afs_manage_cell);
+ atomic_set(&cell->ref, 1);
+ atomic_set(&cell->active, 0);
+ INIT_WORK(&cell->manager, afs_manage_cell_work);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
@@ -206,6 +199,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->dns_source = vllist->source;
cell->dns_status = vllist->status;
smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
+ atomic_inc(&net->cells_outstanding);
+ cell->debug_id = atomic_inc_return(&cell_debug_id);
+ trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc);
_leave(" = %p", cell);
return cell;
@@ -245,9 +241,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
_enter("%s,%s", name, vllist);
if (!excl) {
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(net, name, namesz);
- rcu_read_unlock();
+ cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup);
if (!IS_ERR(cell))
goto wait_for_cell;
}
@@ -268,7 +262,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
/* Find the insertion point and check to see if someone else added a
* cell whilst we were allocating.
*/
- write_seqlock(&net->cells_lock);
+ down_write(&net->cells_lock);
pp = &net->cells.rb_node;
parent = NULL;
@@ -290,23 +284,26 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
+ atomic_set(&cell->active, 2);
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
- atomic_inc(&net->cells_outstanding);
- write_sequnlock(&net->cells_lock);
+ up_write(&net->cells_lock);
- queue_work(afs_wq, &cell->manager);
+ afs_queue_cell(cell, afs_cell_trace_get_queue_new);
wait_for_cell:
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active),
+ afs_cell_trace_wait);
_debug("wait_for_cell");
wait_var_event(&cell->state,
({
state = smp_load_acquire(&cell->state); /* vs error */
- state == AFS_CELL_ACTIVE || state == AFS_CELL_FAILED;
+ state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED;
}));
/* Check the state obtained from the wait check. */
- if (state == AFS_CELL_FAILED) {
+ if (state == AFS_CELL_REMOVED) {
ret = cell->error;
goto error;
}
@@ -320,16 +317,17 @@ cell_already_exists:
if (excl) {
ret = -EEXIST;
} else {
- afs_get_cell(cursor);
+ afs_use_cell(cursor, afs_cell_trace_use_lookup);
ret = 0;
}
- write_sequnlock(&net->cells_lock);
- kfree(candidate);
+ up_write(&net->cells_lock);
+ if (candidate)
+ afs_put_cell(candidate, afs_cell_trace_put_candidate);
if (ret == 0)
goto wait_for_cell;
goto error_noput;
error:
- afs_put_cell(net, cell);
+ afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup);
error_noput:
_leave(" = %d [error]", ret);
return ERR_PTR(ret);
@@ -374,15 +372,16 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
}
if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags))
- afs_get_cell(new_root);
+ afs_use_cell(new_root, afs_cell_trace_use_pin);
/* install the new cell */
- write_seqlock(&net->cells_lock);
- old_root = rcu_access_pointer(net->ws_cell);
- rcu_assign_pointer(net->ws_cell, new_root);
- write_sequnlock(&net->cells_lock);
+ down_write(&net->cells_lock);
+ afs_see_cell(new_root, afs_cell_trace_see_ws);
+ old_root = net->ws_cell;
+ net->ws_cell = new_root;
+ up_write(&net->cells_lock);
- afs_put_cell(net, old_root);
+ afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws);
_leave(" = 0");
return 0;
}
@@ -488,18 +487,22 @@ out_wake:
static void afs_cell_destroy(struct rcu_head *rcu)
{
struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu);
+ struct afs_net *net = cell->net;
+ int u;
_enter("%p{%s}", cell, cell->name);
- ASSERTCMP(atomic_read(&cell->usage), ==, 0);
+ u = atomic_read(&cell->ref);
+ ASSERTCMP(u, ==, 0);
+ trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free);
- afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root);
- afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
- afs_put_cell(cell->net, cell->alias_of);
+ afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
+ afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
key_put(cell->anonymous_key);
kfree(cell->name);
kfree(cell);
+ afs_dec_cells_outstanding(net);
_leave(" [destroyed]");
}
@@ -532,18 +535,63 @@ void afs_cells_timer(struct timer_list *timer)
/*
* Get a reference on a cell record.
*/
-struct afs_cell *afs_get_cell(struct afs_cell *cell)
+struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- atomic_inc(&cell->usage);
+ int u;
+
+ if (atomic_read(&cell->ref) <= 0)
+ BUG();
+
+ u = atomic_inc_return(&cell->ref);
+ trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason);
return cell;
}
/*
* Drop a reference on a cell record.
*/
-void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
+void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
+ if (cell) {
+ unsigned int debug_id = cell->debug_id;
+ unsigned int u, a;
+
+ a = atomic_read(&cell->active);
+ u = atomic_dec_return(&cell->ref);
+ trace_afs_cell(debug_id, u, a, reason);
+ if (u == 0) {
+ a = atomic_read(&cell->active);
+ WARN(a != 0, "Cell active count %u > 0\n", a);
+ call_rcu(&cell->rcu, afs_cell_destroy);
+ }
+ }
+}
+
+/*
+ * Note a cell becoming more active.
+ */
+struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ int u, a;
+
+ if (atomic_read(&cell->ref) <= 0)
+ BUG();
+
+ u = atomic_read(&cell->ref);
+ a = atomic_inc_return(&cell->active);
+ trace_afs_cell(cell->debug_id, u, a, reason);
+ return cell;
+}
+
+/*
+ * Record a cell becoming less active. When the active counter reaches 1, it
+ * is scheduled for destruction, but may get reactivated.
+ */
+void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ unsigned int debug_id = cell->debug_id;
time64_t now, expire_delay;
+ int u, a;
if (!cell)
return;
@@ -556,11 +604,35 @@ void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
if (cell->vl_servers->nr_servers)
expire_delay = afs_cell_gc_delay;
- if (atomic_dec_return(&cell->usage) > 1)
- return;
+ u = atomic_read(&cell->ref);
+ a = atomic_dec_return(&cell->active);
+ trace_afs_cell(debug_id, u, a, reason);
+ WARN_ON(a == 0);
+ if (a == 1)
+ /* 'cell' may now be garbage collected. */
+ afs_set_cell_timer(net, expire_delay);
+}
+
+/*
+ * Note that a cell has been seen.
+ */
+void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ int u, a;
+
+ u = atomic_read(&cell->ref);
+ a = atomic_read(&cell->active);
+ trace_afs_cell(cell->debug_id, u, a, reason);
+}
- /* 'cell' may now be garbage collected. */
- afs_set_cell_timer(net, expire_delay);
+/*
+ * Queue a cell for management, giving the workqueue a ref to hold.
+ */
+void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ afs_get_cell(cell, reason);
+ if (!queue_work(afs_wq, &cell->manager))
+ afs_put_cell(cell, afs_cell_trace_put_queue_fail);
}
/*
@@ -660,12 +732,10 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
* Manage a cell record, initialising and destroying it, maintaining its DNS
* records.
*/
-static void afs_manage_cell(struct work_struct *work)
+static void afs_manage_cell(struct afs_cell *cell)
{
- struct afs_cell *cell = container_of(work, struct afs_cell, manager);
struct afs_net *net = cell->net;
- bool deleted;
- int ret, usage;
+ int ret, active;
_enter("%s", cell->name);
@@ -674,14 +744,19 @@ again:
switch (cell->state) {
case AFS_CELL_INACTIVE:
case AFS_CELL_FAILED:
- write_seqlock(&net->cells_lock);
- usage = 1;
- deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0);
- if (deleted)
+ down_write(&net->cells_lock);
+ active = 1;
+ if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
rb_erase(&cell->net_node, &net->cells);
- write_sequnlock(&net->cells_lock);
- if (deleted)
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0,
+ afs_cell_trace_unuse_delete);
+ smp_store_release(&cell->state, AFS_CELL_REMOVED);
+ }
+ up_write(&net->cells_lock);
+ if (cell->state == AFS_CELL_REMOVED) {
+ wake_up_var(&cell->state);
goto final_destruction;
+ }
if (cell->state == AFS_CELL_FAILED)
goto done;
smp_store_release(&cell->state, AFS_CELL_UNSET);
@@ -703,7 +778,7 @@ again:
goto again;
case AFS_CELL_ACTIVE:
- if (atomic_read(&cell->usage) > 1) {
+ if (atomic_read(&cell->active) > 1) {
if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
ret = afs_update_cell(cell);
if (ret < 0)
@@ -716,13 +791,16 @@ again:
goto again;
case AFS_CELL_DEACTIVATING:
- if (atomic_read(&cell->usage) > 1)
+ if (atomic_read(&cell->active) > 1)
goto reverse_deactivation;
afs_deactivate_cell(net, cell);
smp_store_release(&cell->state, AFS_CELL_INACTIVE);
wake_up_var(&cell->state);
goto again;
+ case AFS_CELL_REMOVED:
+ goto done;
+
default:
break;
}
@@ -748,9 +826,18 @@ done:
return;
final_destruction:
- call_rcu(&cell->rcu, afs_cell_destroy);
- afs_dec_cells_outstanding(net);
- _leave(" [destruct %d]", atomic_read(&net->cells_outstanding));
+ /* The root volume is pinning the cell */
+ afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root);
+ cell->root_volume = NULL;
+ afs_put_cell(cell, afs_cell_trace_put_destroy);
+}
+
+static void afs_manage_cell_work(struct work_struct *work)
+{
+ struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+
+ afs_manage_cell(cell);
+ afs_put_cell(cell, afs_cell_trace_put_queue_work);
}
/*
@@ -779,26 +866,29 @@ void afs_manage_cells(struct work_struct *work)
* lack of use and cells whose DNS results have expired and dispatch
* their managers.
*/
- read_seqlock_excl(&net->cells_lock);
+ down_read(&net->cells_lock);
for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
struct afs_cell *cell =
rb_entry(cursor, struct afs_cell, net_node);
- unsigned usage;
+ unsigned active;
bool sched_cell = false;
- usage = atomic_read(&cell->usage);
- _debug("manage %s %u", cell->name, usage);
+ active = atomic_read(&cell->active);
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ active, afs_cell_trace_manage);
- ASSERTCMP(usage, >=, 1);
+ ASSERTCMP(active, >=, 1);
if (purging) {
- if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
- usage = atomic_dec_return(&cell->usage);
- ASSERTCMP(usage, ==, 1);
+ if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
+ active = atomic_dec_return(&cell->active);
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ active, afs_cell_trace_unuse_pin);
+ }
}
- if (usage == 1) {
+ if (active == 1) {
struct afs_vlserver_list *vllist;
time64_t expire_at = cell->last_inactive;
@@ -821,10 +911,10 @@ void afs_manage_cells(struct work_struct *work)
}
if (sched_cell)
- queue_work(afs_wq, &cell->manager);
+ afs_queue_cell(cell, afs_cell_trace_get_queue_manage);
}
- read_sequnlock_excl(&net->cells_lock);
+ up_read(&net->cells_lock);
/* Update the timer on the way out. We have to pass an increment on
* cells_outstanding in the namespace that we are in to the timer or
@@ -854,11 +944,11 @@ void afs_cell_purge(struct afs_net *net)
_enter("");
- write_seqlock(&net->cells_lock);
- ws = rcu_access_pointer(net->ws_cell);
- RCU_INIT_POINTER(net->ws_cell, NULL);
- write_sequnlock(&net->cells_lock);
- afs_put_cell(net, ws);
+ down_write(&net->cells_lock);
+ ws = net->ws_cell;
+ net->ws_cell = NULL;
+ up_write(&net->cells_lock);
+ afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws);
_debug("del timer");
if (del_timer_sync(&net->cells_timer))
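The heart of this afs/cell.c rewrite is splitting the old single usage counter in two: ref pins the memory (the cell is freed via RCU once it hits zero) while active counts genuine users and drives garbage collection, with tree membership holding a floor of one. With lookups taking real references, the seqlock around the cell tree can become a plain rw_semaphore. A condensed userspace sketch of the two-counter pattern follows; the names are simplified stand-ins, not the kernel API:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int ref;		/* lifetime: free when it reaches 0 */
	atomic_int active;	/* usage: drives garbage collection */
};

static struct obj *obj_get(struct obj *o)	/* cf. afs_get_cell() */
{
	atomic_fetch_add(&o->ref, 1);
	return o;
}

static void obj_put(struct obj *o)		/* cf. afs_put_cell() */
{
	if (atomic_fetch_sub(&o->ref, 1) == 1)
		free(o);	/* the kernel defers this via call_rcu() */
}

static struct obj *obj_use(struct obj *o)	/* cf. afs_use_cell() */
{
	atomic_fetch_add(&o->active, 1);
	return o;
}

static void obj_unuse(struct obj *o)		/* cf. afs_unuse_cell() */
{
	/* The kernel keeps active >= 1 while the cell sits in the tree
	 * and treats a drop back to 1 as "may now be garbage collected". */
	if (atomic_fetch_sub(&o->active, 1) == 1)
		; /* fully idle: kick the manager to expire/destroy */
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	obj_get(o);	/* creation reference */
	obj_use(o);	/* a mount starts using the cell */
	obj_unuse(o);	/* ...and stops: the cell becomes expirable */
	obj_put(o);	/* last ref gone: freed */
	return 0;
}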
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 7b784af604fd..db832cc931c8 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -123,9 +123,9 @@ static int afs_probe_cell_name(struct dentry *dentry)
len--;
}
- cell = afs_lookup_cell_rcu(net, name, len);
+ cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe);
if (!IS_ERR(cell)) {
- afs_put_cell(net, cell);
+ afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe);
return 0;
}
@@ -179,7 +179,6 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry)
struct afs_cell *cell;
struct afs_net *net = afs_d2net(dentry);
struct dentry *ret;
- unsigned int seq = 0;
char *name;
int len;
@@ -191,17 +190,13 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry)
if (!name)
goto out_p;
- rcu_read_lock();
- do {
- read_seqbegin_or_lock(&net->cells_lock, &seq);
- cell = rcu_dereference_raw(net->ws_cell);
- if (cell) {
- len = cell->name_len;
- memcpy(name, cell->name, len + 1);
- }
- } while (need_seqretry(&net->cells_lock, seq));
- done_seqretry(&net->cells_lock, seq);
- rcu_read_unlock();
+ down_read(&net->cells_lock);
+ cell = net->ws_cell;
+ if (cell) {
+ len = cell->name_len;
+ memcpy(name, cell->name, len + 1);
+ }
+ up_read(&net->cells_lock);
ret = ERR_PTR(-ENOENT);
if (!cell)
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1d13d2e882ad..0fe8844b4bee 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
- struct inode *inode = &op->file[0].vnode->vfs_inode;
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
+ loff_t old_i_size = i_size_read(inode);
+
+ op->setattr.old_i_size = old_i_size;
+ afs_vnode_commit_status(op, vp);
+ /* inode->i_size has now been changed. */
+
+ if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+ loff_t size = op->setattr.attr->ia_size;
+ if (size > old_i_size)
+ pagecache_isize_extended(inode, old_i_size, size);
+ }
+}
+
+static void afs_setattr_edit_file(struct afs_operation *op)
+{
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
- afs_vnode_commit_status(op, &op->file[0]);
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
- loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
- if (size > i_size)
- pagecache_isize_extended(inode, i_size, size);
- truncate_pagecache(inode, size);
+ loff_t size = op->setattr.attr->ia_size;
+ loff_t i_size = op->setattr.old_i_size;
+
+ if (size < i_size)
+ truncate_pagecache(inode, size);
}
}
@@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = {
.issue_afs_rpc = afs_fs_setattr,
.issue_yfs_rpc = yfs_fs_setattr,
.success = afs_setattr_success,
+ .edit_dir = afs_setattr_edit_file,
};
/*
@@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(vnode->vfs_inode.i_mode))
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ /* Prevent any new writebacks from starting whilst we do this. */
+ down_write(&vnode->validate_lock);
+
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
afs_file_key(attr->ia_file) : NULL),
vnode->volume);
- if (IS_ERR(op))
- return PTR_ERR(op);
+ if (IS_ERR(op)) {
+ ret = PTR_ERR(op);
+ goto out_unlock;
+ }
afs_op_set_vnode(op, 0, vnode);
op->setattr.attr = attr;
@@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
op->file[0].update_ctime = 1;
op->ops = &afs_setattr_operation;
- return afs_do_sync_operation(op);
+ ret = afs_do_sync_operation(op);
+
+out_unlock:
+ up_write(&vnode->validate_lock);
+ _leave(" = %d", ret);
+ return ret;
}
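The setattr rework hinges on ordering around the i_size commit: the old code truncated the pagecache in the success handler after the new size was already visible and extended it using the already-updated i_size. Now the old size is captured first, extension bookkeeping (pagecache_isize_extended()) runs after the commit, shrinking is deferred to the edit phase, and the whole operation runs under validate_lock to exclude writeback. Reduced to its essentials, the shape is roughly this; a hypothetical condensation that merges afs_setattr_success() and afs_setattr_edit_file() for clarity, not the actual operation flow:

static void setattr_apply_pagecache(struct afs_operation *op, struct inode *inode)
{
	loff_t old_size = op->setattr.old_i_size;	/* captured pre-commit */
	loff_t new_size = op->setattr.attr->ia_size;

	if (!(op->setattr.attr->ia_valid & ATTR_SIZE))
		return;

	if (new_size > old_size)
		/* grown: zero the tail of what used to be the last page */
		pagecache_isize_extended(inode, old_size, new_size);
	else if (new_size < old_size)
		/* shrunk: drop pagecache beyond the new EOF */
		truncate_pagecache(inode, new_size);
}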
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 18042b7dab6a..81b0485fd22a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -263,11 +263,11 @@ struct afs_net {
/* Cell database */
struct rb_root cells;
- struct afs_cell __rcu *ws_cell;
+ struct afs_cell *ws_cell;
struct work_struct cells_manager;
struct timer_list cells_timer;
atomic_t cells_outstanding;
- seqlock_t cells_lock;
+ struct rw_semaphore cells_lock;
struct mutex cells_alias_lock;
struct mutex proc_cells_lock;
@@ -326,6 +326,7 @@ enum afs_cell_state {
AFS_CELL_DEACTIVATING,
AFS_CELL_INACTIVE,
AFS_CELL_FAILED,
+ AFS_CELL_REMOVED,
};
/*
@@ -363,7 +364,8 @@ struct afs_cell {
#endif
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
- atomic_t usage;
+ atomic_t ref; /* Struct refcount */
+ atomic_t active; /* Active usage counter */
unsigned long flags;
#define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */
#define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */
@@ -373,6 +375,7 @@ struct afs_cell {
enum dns_record_source dns_source:8; /* Latest source of data from lookup */
enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */
unsigned int dns_lookup_count; /* Counter of DNS lookups */
+ unsigned int debug_id;
/* The volumes belonging to this cell */
struct rb_root volumes; /* Tree of volumes on this server */
@@ -812,6 +815,7 @@ struct afs_operation {
} store;
struct {
struct iattr *attr;
+ loff_t old_i_size;
} setattr;
struct afs_acl *acl;
struct yfs_acl *yacl;
@@ -916,11 +920,16 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
* cell.c
*/
extern int afs_cell_init(struct afs_net *, const char *);
-extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned);
+extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
+ enum afs_cell_trace);
extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
const char *, bool);
-extern struct afs_cell *afs_get_cell(struct afs_cell *);
-extern void afs_put_cell(struct afs_net *, struct afs_cell *);
+extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace);
+extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_manage_cells(struct work_struct *);
extern void afs_cells_timer(struct timer_list *);
extern void __net_exit afs_cell_purge(struct afs_net *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 31b472f7c734..accdd8970e7c 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -78,7 +78,7 @@ static int __net_init afs_net_init(struct net *net_ns)
mutex_init(&net->socket_mutex);
net->cells = RB_ROOT;
- seqlock_init(&net->cells_lock);
+ init_rwsem(&net->cells_lock);
INIT_WORK(&net->cells_manager, afs_manage_cells);
timer_setup(&net->cells_timer, afs_cells_timer, 0);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 79bc5f1338ed..052dab2f5c03 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -88,7 +88,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ctx->force = true;
}
if (ctx->cell) {
- afs_put_cell(ctx->net, ctx->cell);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt);
ctx->cell = NULL;
}
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
@@ -124,7 +124,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
char *buf;
if (src_as->cell)
- ctx->cell = afs_get_cell(src_as->cell);
+ ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt);
if (size < 2 || size > PAGE_SIZE - 1)
return -EINVAL;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e8babb62ed44..065a28bfa3f1 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -38,7 +38,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
if (v == SEQ_START_TOKEN) {
/* display header on line 1 */
- seq_puts(m, "USE TTL SV ST NAME\n");
+ seq_puts(m, "USE ACT TTL SV ST NAME\n");
return 0;
}
@@ -46,10 +46,11 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
vllist = rcu_dereference(cell->vl_servers);
/* display one cell per line on subsequent lines */
- seq_printf(m, "%3u %6lld %2u %2u %s\n",
- atomic_read(&cell->usage),
+ seq_printf(m, "%3u %3u %6lld %2u %2u %s\n",
+ atomic_read(&cell->ref),
+ atomic_read(&cell->active),
cell->dns_expiry - ktime_get_real_seconds(),
- vllist->nr_servers,
+ vllist ? vllist->nr_servers : 0,
cell->state,
cell->name);
return 0;
@@ -128,7 +129,7 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
}
if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
- afs_put_cell(net, cell);
+ afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin);
} else {
goto inval;
}
@@ -154,13 +155,11 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v)
struct afs_net *net;
net = afs_seq2net_single(m);
- if (rcu_access_pointer(net->ws_cell)) {
- rcu_read_lock();
- cell = rcu_dereference(net->ws_cell);
- if (cell)
- seq_printf(m, "%s\n", cell->name);
- rcu_read_unlock();
- }
+ down_read(&net->cells_lock);
+ cell = net->ws_cell;
+ if (cell)
+ seq_printf(m, "%s\n", cell->name);
+ up_read(&net->cells_lock);
return 0;
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index e82e452e2612..684a2b02b9ff 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -550,7 +550,12 @@ void afs_manage_servers(struct work_struct *work)
_debug("manage %pU %u", &server->uuid, active);
- ASSERTIFCMP(purging, active, ==, 0);
+ if (purging) {
+ trace_afs_server(server, atomic_read(&server->ref),
+ active, afs_server_trace_purging);
+ if (active != 0)
+ pr_notice("Can't purge s=%08x\n", server->debug_id);
+ }
if (active == 0) {
time64_t expire_at = server->unuse_time;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index b552357b1d13..6c5900df6aa5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -294,7 +294,8 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_put_cell(ctx->net, ctx->cell);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse);
+ afs_see_cell(cell, afs_cell_trace_see_source);
ctx->cell = cell;
}
@@ -389,8 +390,9 @@ static int afs_validate_fc(struct fs_context *fc)
_debug("switch to alias");
key_put(ctx->key);
ctx->key = NULL;
- cell = afs_get_cell(ctx->cell->alias_of);
- afs_put_cell(ctx->net, ctx->cell);
+ cell = afs_use_cell(ctx->cell->alias_of,
+ afs_cell_trace_use_fc_alias);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
ctx->cell = cell;
goto reget_key;
}
@@ -456,7 +458,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
@@ -508,7 +509,7 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
if (ctx->dyn_root) {
as->dyn_root = true;
} else {
- as->cell = afs_get_cell(ctx->cell);
+ as->cell = afs_use_cell(ctx->cell, afs_cell_trace_use_sbi);
as->volume = afs_get_volume(ctx->volume,
afs_volume_trace_get_alloc_sbi);
}
@@ -521,7 +522,7 @@ static void afs_destroy_sbi(struct afs_super_info *as)
if (as) {
struct afs_net *net = afs_net(as->net_ns);
afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi);
- afs_put_cell(net, as->cell);
+ afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
}
@@ -607,7 +608,7 @@ static void afs_free_fc(struct fs_context *fc)
afs_destroy_sbi(fc->s_fs_info);
afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc);
- afs_put_cell(ctx->net, ctx->cell);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
}
@@ -634,9 +635,7 @@ static int afs_init_fs_context(struct fs_context *fc)
ctx->net = afs_net(fc->net_ns);
/* Default to the workstation cell. */
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(ctx->net, NULL, 0);
- rcu_read_unlock();
+ cell = afs_find_cell(ctx->net, NULL, 0, afs_cell_trace_use_fc);
if (IS_ERR(cell))
cell = NULL;
ctx->cell = cell;
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index 5082ef04e99c..f04a80e4f5c3 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -177,7 +177,7 @@ static int afs_compare_cell_roots(struct afs_cell *cell)
is_alias:
rcu_read_unlock();
- cell->alias_of = afs_get_cell(p);
+ cell->alias_of = afs_use_cell(p, afs_cell_trace_use_alias);
return 1;
}
@@ -247,18 +247,18 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key)
continue;
if (p->root_volume)
continue; /* Ignore cells that have a root.cell volume. */
- afs_get_cell(p);
+ afs_use_cell(p, afs_cell_trace_use_check_alias);
mutex_unlock(&cell->net->proc_cells_lock);
if (afs_query_for_alias_one(cell, key, p) != 0)
goto is_alias;
if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) {
- afs_put_cell(cell->net, p);
+ afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
return -ERESTARTSYS;
}
- afs_put_cell(cell->net, p);
+ afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
}
mutex_unlock(&cell->net->proc_cells_lock);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index c0458c903b31..488e58490b16 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -45,7 +45,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
cell->dns_expiry <= ktime_get_real_seconds()) {
dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
- queue_work(afs_wq, &cell->manager);
+ afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
if (wait_var_event_interruptible(
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 9bc0509e3634..f84194b791d3 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -83,7 +83,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->vid = vldb->vid[params->type];
volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
- volume->cell = afs_get_cell(params->cell);
+ volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol);
volume->type = params->type;
volume->type_force = params->force;
volume->name_len = vldb->name_len;
@@ -106,7 +106,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
return volume;
error_1:
- afs_put_cell(params->net, volume->cell);
+ afs_put_cell(volume->cell, afs_cell_trace_put_vol);
kfree(volume);
error_0:
return ERR_PTR(ret);
@@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(net, rcu_access_pointer(volume->servers));
- afs_put_cell(net, volume->cell);
+ afs_put_cell(volume->cell, afs_cell_trace_put_vol);
trace_afs_volume(volume->vid, atomic_read(&volume->usage),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 4b2265cb1891..da12abd6db21 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping,
int afs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
pgoff_t start, end, next;
int ret;
_enter("");
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ down_read(&vnode->validate_lock);
+ else if (!down_read_trylock(&vnode->validate_lock))
+ return 0;
+
if (wbc->range_cyclic) {
start = mapping->writeback_index;
end = -1;
@@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping,
ret = afs_writepages_region(mapping, wbc, start, end, &next);
}
+ up_read(&vnode->validate_lock);
_leave(" = %d", ret);
return ret;
}
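The asymmetric locking above is a standard writeback idiom: a data-integrity sync (WB_SYNC_ALL) must wait for vnode->validate_lock because the caller is promising durability, whereas opportunistic background writeback may simply give up while a truncate holds the lock and let the flusher retry later. The general shape, as a kernel-style sketch:

#include <linux/rwsem.h>

static int writepages_shape(struct rw_semaphore *lock, bool for_sync)
{
	if (for_sync)
		down_read(lock);		/* integrity: must wait */
	else if (!down_read_trylock(lock))
		return 0;			/* busy: retry on a later pass */

	/* ... write back the dirty pages under the lock ... */

	up_read(lock);
	return 0;
}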
diff --git a/fs/aio.c b/fs/aio.c
index d5ec30385566..c45c20d87538 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
*iovec = NULL;
return ret;
}
-#ifdef CONFIG_COMPAT
- if (compat)
- return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
- iter);
-#endif
- return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
+
+ return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
}
static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
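This hunk depends on __import_iovec() having grown a bool compat parameter in the same series, which lets callers delete their #ifdef CONFIG_COMPAT forks: one call site now copies in either the native or the 32-bit iovec layout. The refactoring pattern in miniature, with illustrative types rather than the kernel's:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct iovec_native { void *iov_base; size_t iov_len; };
struct iovec_compat { uint32_t iov_base; uint32_t iov_len; };	/* 32-bit ABI */

/* One entry point; the compat flag selects the user-space layout here,
 * once, instead of every caller carrying an #ifdef'd twin. */
static void import_one(struct iovec_native *out, const void *uvec,
		       size_t i, bool compat)
{
	if (compat) {
		const struct iovec_compat *v =
			(const struct iovec_compat *)uvec + i;
		out->iov_base = (void *)(uintptr_t)v->iov_base;	/* widen */
		out->iov_len = v->iov_len;
	} else {
		*out = ((const struct iovec_native *)uvec)[i];
	}
}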
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 75105f45c51a..322b7dfb4ea0 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -8,6 +8,7 @@
#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/magic.h>
+#include <linux/nospec.h>
#include "autofs_i.h"
@@ -563,7 +564,7 @@ out:
static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
{
- static ioctl_fn _ioctls[] = {
+ static const ioctl_fn _ioctls[] = {
autofs_dev_ioctl_version,
autofs_dev_ioctl_protover,
autofs_dev_ioctl_protosubver,
@@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
};
unsigned int idx = cmd_idx(cmd);
- return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx];
+ if (idx >= ARRAY_SIZE(_ioctls))
+ return NULL;
+ idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls));
+ return _ioctls[idx];
}
/* ioctl dispatcher */
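array_index_nospec() is the stock Spectre-v1 (bounds-check-bypass) mitigation: even after idx >= ARRAY_SIZE(_ioctls) fails architecturally, the CPU can speculate down the in-bounds path with an attacker-controlled idx, so the index is clamped to a speculation-safe value before it selects a function pointer. The canonical usage shape, as a kernel-style sketch with the table contents elided:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/nospec.h>

static int dispatch(unsigned int idx)
{
	static const int table[8] = { /* ... */ };

	if (idx >= ARRAY_SIZE(table))
		return -EINVAL;
	/* Clamp so a speculative out-of-bounds access reads index 0
	 * rather than attacker-chosen memory. */
	idx = array_index_nospec(idx, ARRAY_SIZE(table));
	return table[idx];
}

Marking the table const, as the hunk above also does, additionally moves the function pointers into read-only memory.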
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 74c886f7c51c..5ced859dac53 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes) {
- wr = kernel_write(file, data, bytes, &file->f_pos);
+ wr = __kernel_write(file, data, bytes, NULL);
if (wr <= 0)
break;
data += wr;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 13d053982dd7..b6b3d052ca86 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/errno.h>
@@ -309,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
* Grow the stack manually; some architectures have a limit on how
* far ahead a user-space access may be in order to grow the stack.
*/
+ if (mmap_read_lock_killable(mm))
+ return -EINTR;
vma = find_extend_vma(mm, bprm->p);
+ mmap_read_unlock(mm);
if (!vma)
return -EFAULT;
@@ -421,6 +425,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
return 0;
}
+static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr)
+{
+ unsigned long alignment = 0;
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ if (cmds[i].p_type == PT_LOAD) {
+ unsigned long p_align = cmds[i].p_align;
+
+ /* skip non-power of two alignments as invalid */
+ if (!is_power_of_2(p_align))
+ continue;
+ alignment = max(alignment, p_align);
+ }
+ }
+
+ /* ensure we align to at least one page */
+ return ELF_PAGEALIGN(alignment);
+}
+
/**
* load_elf_phdrs() - load ELF program headers
* @elf_ex: ELF header of the binary whose program headers should be loaded
@@ -1008,6 +1032,7 @@ out_free_interp:
int elf_prot, elf_flags;
unsigned long k, vaddr;
unsigned long total_size = 0;
+ unsigned long alignment;
if (elf_ppnt->p_type != PT_LOAD)
continue;
@@ -1086,6 +1111,9 @@ out_free_interp:
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+ if (alignment)
+ load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED;
} else
load_bias = 0;
@@ -1389,126 +1417,6 @@ out:
* Jeremy Fitzhardinge <jeremy@sw.oz.au>
*/
-/*
- * The purpose of always_dump_vma() is to make sure that special kernel mappings
- * that are useful for post-mortem analysis are included in every core dump.
- * In that way we ensure that the core dump is fully interpretable later
- * without matching up the same kernel and hardware config to see what PC values
- * meant. These special mappings include - vDSO, vsyscall, and other
- * architecture specific mappings
- */
-static bool always_dump_vma(struct vm_area_struct *vma)
-{
- /* Any vsyscall mappings? */
- if (vma == get_gate_vma(vma->vm_mm))
- return true;
-
- /*
- * Assume that all vmas with a .name op should always be dumped.
- * If this changes, a new vm_ops field can easily be added.
- */
- if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
- return true;
-
- /*
- * arch_vma_name() returns non-NULL for special architecture mappings,
- * such as vDSO sections.
- */
- if (arch_vma_name(vma))
- return true;
-
- return false;
-}
-
-/*
- * Decide what to dump of a segment, part, all or none.
- */
-static unsigned long vma_dump_size(struct vm_area_struct *vma,
- unsigned long mm_flags)
-{
-#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
-
- /* always dump the vdso and vsyscall sections */
- if (always_dump_vma(vma))
- goto whole;
-
- if (vma->vm_flags & VM_DONTDUMP)
- return 0;
-
- /* support for DAX */
- if (vma_is_dax(vma)) {
- if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
- goto whole;
- if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
- goto whole;
- return 0;
- }
-
- /* Hugetlb memory check */
- if (is_vm_hugetlb_page(vma)) {
- if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
- goto whole;
- if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
- goto whole;
- return 0;
- }
-
- /* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & VM_IO)
- return 0;
-
- /* By default, dump shared memory if mapped from an anonymous file. */
- if (vma->vm_flags & VM_SHARED) {
- if (file_inode(vma->vm_file)->i_nlink == 0 ?
- FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
- goto whole;
- return 0;
- }
-
- /* Dump segments that have been written to. */
- if (vma->anon_vma && FILTER(ANON_PRIVATE))
- goto whole;
- if (vma->vm_file == NULL)
- return 0;
-
- if (FILTER(MAPPED_PRIVATE))
- goto whole;
-
- /*
- * If this looks like the beginning of a DSO or executable mapping,
- * check for an ELF header. If we find one, dump the first page to
- * aid in determining what was mapped here.
- */
- if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
- u32 __user *header = (u32 __user *) vma->vm_start;
- u32 word;
- /*
- * Doing it this way gets the constant folded by GCC.
- */
- union {
- u32 cmp;
- char elfmag[SELFMAG];
- } magic;
- BUILD_BUG_ON(SELFMAG != sizeof word);
- magic.elfmag[EI_MAG0] = ELFMAG0;
- magic.elfmag[EI_MAG1] = ELFMAG1;
- magic.elfmag[EI_MAG2] = ELFMAG2;
- magic.elfmag[EI_MAG3] = ELFMAG3;
- if (unlikely(get_user(word, header)))
- word = 0;
- if (word == magic.cmp)
- return PAGE_SIZE;
- }
-
-#undef FILTER
-
- return 0;
-
-whole:
- return vma->vm_end - vma->vm_start;
-}
-
/* An ELF note in memory */
struct memelfnote
{
@@ -2220,32 +2128,6 @@ static void free_note_info(struct elf_note_info *info)
#endif
-static struct vm_area_struct *first_vma(struct task_struct *tsk,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret = tsk->mm->mmap;
-
- if (ret)
- return ret;
- return gate_vma;
-}
-/*
- * Helper function for iterating across a vma list. It ensures that the caller
- * will visit `gate_vma' prior to terminating the search.
- */
-static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret;
-
- ret = this_vma->vm_next;
- if (ret)
- return ret;
- if (this_vma == gate_vma)
- return NULL;
- return gate_vma;
-}
-
static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
elf_addr_t e_shoff, int segs)
{
@@ -2272,9 +2154,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int segs, i;
- size_t vma_data_size = 0;
- struct vm_area_struct *vma, *gate_vma;
+ int vma_count, segs, i;
+ size_t vma_data_size;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2282,30 +2163,16 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- elf_addr_t *vma_filesz = NULL;
+ struct core_vma_metadata *vma_meta;
+
+ if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
+ return 0;
- /*
- * We no longer stop all VM operations.
- *
- * This is because those proceses that could possibly change map_count
- * or the mmap / vma pages are now blocked in do_exit on current
- * finishing this core dump.
- *
- * Only ptrace can touch these memory addresses, but it doesn't change
- * the map_count or the pages allocated. So no possibility of crashing
- * exists while dumping the mm->vm_next areas to the core file.
- */
-
/*
* The number of segs is recorded into the ELF header as a 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = current->mm->map_count;
- segs += elf_core_extra_phdrs();
-
- gate_vma = get_gate_vma(current->mm);
- if (gate_vma != NULL)
- segs++;
+ segs = vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2343,24 +2210,6 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- /*
- * Zero vma process will get ZERO_SIZE_PTR here.
- * Let coredump continue for register state at least.
- */
- vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)),
- GFP_KERNEL);
- if (!vma_filesz)
- goto end_coredump;
-
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
- unsigned long dump_size;
-
- dump_size = vma_dump_size(vma, cprm->mm_flags);
- vma_filesz[i++] = dump_size;
- vma_data_size += dump_size;
- }
-
offset += vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2381,21 +2230,23 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
phdr.p_offset = offset;
- phdr.p_vaddr = vma->vm_start;
+ phdr.p_vaddr = meta->start;
phdr.p_paddr = 0;
- phdr.p_filesz = vma_filesz[i++];
- phdr.p_memsz = vma->vm_end - vma->vm_start;
+ phdr.p_filesz = meta->dump_size;
+ phdr.p_memsz = meta->end - meta->start;
offset += phdr.p_filesz;
- phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
- if (vma->vm_flags & VM_WRITE)
+ phdr.p_flags = 0;
+ if (meta->flags & VM_READ)
+ phdr.p_flags |= PF_R;
+ if (meta->flags & VM_WRITE)
phdr.p_flags |= PF_W;
- if (vma->vm_flags & VM_EXEC)
+ if (meta->flags & VM_EXEC)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
@@ -2417,28 +2268,11 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
- unsigned long addr;
- unsigned long end;
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
- end = vma->vm_start + vma_filesz[i++];
-
- for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
- struct page *page;
- int stop;
-
- page = get_dump_page(addr);
- if (page) {
- void *kaddr = kmap(page);
- stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap(page);
- put_page(page);
- } else
- stop = !dump_skip(cprm, PAGE_SIZE);
- if (stop)
- goto end_coredump;
- }
+ if (!dump_user_range(cprm, meta->start, meta->dump_size))
+ goto end_coredump;
}
dump_truncate(cprm);
@@ -2453,7 +2287,7 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_filesz);
+ kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
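Two pieces of this binfmt_elf.c diff deserve a note. First, maximum_alignment(): a PIE may request PT_LOAD alignment larger than a page (2 MiB, say, so the loader can back text with huge pages), and the randomized base must honor it; load_bias &= ~(alignment - 1) rounds the bias down to a multiple of the alignment, which is only valid because non-power-of-two p_align values are skipped. A quick arithmetic check with hypothetical numbers:

#include <stdio.h>

int main(void)
{
	unsigned long load_bias = 0x5555567a1000UL;	/* randomized base */
	unsigned long alignment = 0x200000UL;		/* 2 MiB p_align */

	load_bias &= ~(alignment - 1);	/* clear the low 21 bits */
	printf("%#lx\n", load_bias);	/* prints 0x555556600000 */
	return 0;
}

Second, the core-dump path: the open-coded first_vma()/next_vma() walk and the per-page get_dump_page()/kmap() loop move behind dump_vma_snapshot() and dump_user_range() in fs/coredump.c, so the vma list and per-vma dump sizes are captured once up front rather than recomputed while the dump is being written out.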
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 50f845702b92..be4062b8ba75 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic
int pr_fpvalid; /* True if math co-processor being used. */
};
-/*
- * Decide whether a segment is worth dumping; default is yes to be
- * sure (missing info is worse than too much; etc).
- * Personally I'd include everything, and use the coredump limit...
- *
- * I think we should skip something. But I am not sure how. H.J.
- */
-static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
-{
- int dump_ok;
-
- /* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & VM_IO) {
- kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
- return 0;
- }
-
- /* If we may not read the contents, don't allow us to dump
- * them either. "dump_write()" can't handle it anyway.
- */
- if (!(vma->vm_flags & VM_READ)) {
- kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags);
- return 0;
- }
-
- /* support for DAX */
- if (vma_is_dax(vma)) {
- if (vma->vm_flags & VM_SHARED) {
- dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- } else {
- dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- }
- return dump_ok;
- }
-
- /* By default, dump shared memory if mapped from an anonymous file. */
- if (vma->vm_flags & VM_SHARED) {
- if (file_inode(vma->vm_file)->i_nlink == 0) {
- dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-
- dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-
-#ifdef CONFIG_MMU
- /* By default, if it hasn't been written to, don't write it out */
- if (!vma->anon_vma) {
- dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-#endif
-
- dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags,
- dump_ok ? "yes" : "no");
- return dump_ok;
-}
-
/* An ELF note in memory */
struct memelfnote
{
@@ -1524,54 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
/*
* dump the segments for an MMU process
*/
-static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
+ struct core_vma_metadata *vma_meta,
+ int vma_count)
{
- struct vm_area_struct *vma;
+ int i;
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
-#ifdef CONFIG_MMU
- unsigned long addr;
-#endif
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
- if (!maydump(vma, cprm->mm_flags))
- continue;
-
-#ifdef CONFIG_MMU
- for (addr = vma->vm_start; addr < vma->vm_end;
- addr += PAGE_SIZE) {
- bool res;
- struct page *page = get_dump_page(addr);
- if (page) {
- void *kaddr = kmap(page);
- res = dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap(page);
- put_page(page);
- } else {
- res = dump_skip(cprm, PAGE_SIZE);
- }
- if (!res)
- return false;
- }
-#else
- if (!dump_emit(cprm, (void *) vma->vm_start,
- vma->vm_end - vma->vm_start))
+ if (!dump_user_range(cprm, meta->start, meta->dump_size))
return false;
-#endif
}
return true;
}
-static size_t elf_core_vma_data_size(unsigned long mm_flags)
-{
- struct vm_area_struct *vma;
- size_t size = 0;
-
- for (vma = current->mm->mmap; vma; vma = vma->vm_next)
- if (maydump(vma, mm_flags))
- size += vma->vm_end - vma->vm_start;
- return size;
-}
-
/*
* Actual dumper
*
@@ -1582,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int segs;
+ int vma_count, segs;
int i;
- struct vm_area_struct *vma;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
struct memelfnote psinfo_note, auxv_note;
@@ -1598,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
-
- /*
- * We no longer stop all VM operations.
- *
- * This is because those processes that could possibly change map_count
- * or the mmap / vma pages are now blocked in do_exit on current
- * finishing this core dump.
- *
- * Only ptrace can touch these memory addresses, but it doesn't change
- * the map_count or the pages allocated. So no possibility of crashing
- * exists while dumping the mm->vm_next areas to the core file.
- */
+ struct core_vma_metadata *vma_meta = NULL;
+ size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1619,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
+ if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
+ goto end_coredump;
+
for (ct = current->mm->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1638,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = current->mm->map_count;
- segs += elf_core_extra_phdrs();
+ segs = vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1684,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += elf_core_vma_data_size(cprm->mm_flags);
+ offset += vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1704,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
struct elf_phdr phdr;
size_t sz;
- sz = vma->vm_end - vma->vm_start;
+ sz = meta->end - meta->start;
phdr.p_type = PT_LOAD;
phdr.p_offset = offset;
- phdr.p_vaddr = vma->vm_start;
+ phdr.p_vaddr = meta->start;
phdr.p_paddr = 0;
- phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
+ phdr.p_filesz = meta->dump_size;
phdr.p_memsz = sz;
offset += phdr.p_filesz;
- phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
- if (vma->vm_flags & VM_WRITE)
+ phdr.p_flags = 0;
+ if (meta->flags & VM_READ)
+ phdr.p_flags |= PF_R;
+ if (meta->flags & VM_WRITE)
phdr.p_flags |= PF_W;
- if (vma->vm_flags & VM_EXEC)
+ if (meta->flags & VM_EXEC)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
@@ -1752,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- if (!elf_fdpic_dump_segments(cprm))
+ if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1776,6 +1667,7 @@ end_coredump:
thread_list = thread_list->next;
kfree(tmp);
}
+ kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
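
Both dumpers now derive the PT_LOAD permission bits from the snapshot's flags field rather than from a live vma->vm_flags. The mapping, factored out as a sketch (VM_* and PF_* are the usual kernel constants; the helper name is ours):

/* Sketch: translate VMA permission bits into ELF segment flags. */
static unsigned int vma_flags_to_phdr_flags(unsigned long vm_flags)
{
	unsigned int p_flags = 0;

	if (vm_flags & VM_READ)
		p_flags |= PF_R;
	if (vm_flags & VM_WRITE)
		p_flags |= PF_W;
	if (vm_flags & VM_EXEC)
		p_flags |= PF_X;
	return p_flags;
}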
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8ae833e00443..9e84b1928b94 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev)
}
EXPORT_SYMBOL(invalidate_bdev);
+/*
+ * Drop all buffers & page cache for a given bdev range. This function bails
+ * out with an error if the bdev has another exclusive owner (such as a
+ * filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ struct block_device *claimed_bdev = NULL;
+ int err;
+
+ /*
+ * If we don't hold an exclusive handle for the device, upgrade to
+ * one while we discard the buffer cache, to avoid discarding buffers
+ * under a live filesystem.
+ */
+ if (!(mode & FMODE_EXCL)) {
+ claimed_bdev = bdev->bd_contains;
+ err = bd_prepare_to_claim(bdev, claimed_bdev,
+ truncate_bdev_range);
+ if (err)
+ return err;
+ }
+ truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ if (claimed_bdev)
+ bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+ return 0;
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
static void set_init_blocksize(struct block_device *bdev)
{
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -862,7 +891,7 @@ static int bdev_set(struct inode *inode, void *data)
return 0;
}
-struct block_device *bdget(dev_t dev)
+static struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
@@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev)
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
+ spin_lock_init(&bdev->bd_size_lock);
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
@@ -891,8 +920,6 @@ struct block_device *bdget(dev_t dev)
return bdev;
}
-EXPORT_SYMBOL(bdget);
-
/**
* bdgrab -- Grab a reference to an already referenced block device
* @bdev: Block device to grab a reference to.
@@ -904,6 +931,11 @@ struct block_device *bdgrab(struct block_device *bdev)
}
EXPORT_SYMBOL(bdgrab);
+struct block_device *bdget_part(struct hd_struct *part)
+{
+ return bdget(part_devt(part));
+}
+
long nr_blockdev_pages(void)
{
struct inode *inode;
@@ -1290,6 +1322,7 @@ static void check_disk_size_change(struct gendisk *disk,
{
loff_t disk_size, bdev_size;
+ spin_lock(&bdev->bd_size_lock);
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
@@ -1299,85 +1332,51 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ }
+ spin_unlock(&bdev->bd_size_lock);
+
+ if (bdev_size > disk_size) {
+ if (__invalidate_device(bdev, false))
pr_warn("VFS: busy inodes on resized disk %s\n",
disk->disk_name);
}
- bdev->bd_invalidated = 0;
}
/**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
*
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs. It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks whether the bdev size matches the disk size and
+ * adjusts it if they differ. When the bdev size is shrunk, all its caches
+ * are freed.
*/
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
{
- int ret = 0;
-
- if (disk->fops->revalidate_disk)
- ret = disk->fops->revalidate_disk(disk);
+ struct block_device *bdev;
/*
* Hidden disks don't have associated bdev so there's no point in
- * revalidating it.
+ * revalidating them.
*/
- if (!(disk->flags & GENHD_FL_HIDDEN)) {
- struct block_device *bdev = bdget_disk(disk, 0);
-
- if (!bdev)
- return ret;
+ if (disk->flags & GENHD_FL_HIDDEN)
+ return;
- mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev, ret == 0);
- mutex_unlock(&bdev->bd_mutex);
+ bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ check_disk_size_change(disk, bdev, verbose);
bdput(bdev);
}
- return ret;
}
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);
-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
- struct gendisk *disk = bdev->bd_disk;
- const struct block_device_operations *bdops = disk->fops;
- unsigned int events;
-
- events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
- DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return 0;
-
- if (__invalidate_device(bdev, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- disk->disk_name);
- bdev->bd_invalidated = 1;
- if (bdops->revalidate_disk)
- bdops->revalidate_disk(bdev->bd_disk);
- return 1;
-}
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
- inode_lock(bdev->bd_inode);
- i_size_write(bdev->bd_inode, size);
- inode_unlock(bdev->bd_inode);
+ spin_lock(&bdev->bd_size_lock);
+ i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ spin_unlock(&bdev->bd_size_lock);
}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
@@ -1388,6 +1387,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
lockdep_assert_held(&bdev->bd_mutex);
+ clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+
rescan:
ret = blk_drop_partitions(bdev);
if (ret)
@@ -1446,22 +1447,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
struct gendisk *disk;
int ret;
int partno;
- int perm = 0;
bool first_open = false, unblock_events = true, need_restart;
- if (mode & FMODE_READ)
- perm |= MAY_READ;
- if (mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- /*
- * hooks: /n/, see "layering violations".
- */
- if (!for_part) {
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0)
- return ret;
- }
-
restart:
need_restart = false;
ret = -ENXIO;
@@ -1514,7 +1501,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
}
if (!ret) {
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ bd_set_nr_sectors(bdev, get_capacity(disk));
set_init_blocksize(bdev);
}
@@ -1524,7 +1511,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
@@ -1542,7 +1529,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
ret = -ENXIO;
goto out_clear;
}
- bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
set_init_blocksize(bdev);
}
@@ -1554,7 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
@@ -1632,16 +1619,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* RETURNS:
* 0 on success, -errno on failure.
*/
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- int res;
+ int ret, perm = 0;
- res =__blkdev_get(bdev, mode, holder, 0);
- if (res)
- bdput(bdev);
- return res;
+ if (mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret)
+ goto bdput;
+
+ ret = __blkdev_get(bdev, mode, holder, 0);
+ if (ret)
+ goto bdput;
+ return 0;
+
+bdput:
+ bdput(bdev);
+ return ret;
}
-EXPORT_SYMBOL(blkdev_get);
/**
* blkdev_get_by_path - open a block device by name
@@ -1889,7 +1887,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode))
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
return -ETXTBSY;
if (!iov_iter_count(from))
@@ -1969,7 +1967,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
int error;
@@ -1997,8 +1994,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EINVAL;
/* Invalidate the page cache, including dirty pages. */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ return error;
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
@@ -2025,7 +2023,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
* the caller will be given -EBUSY. The third argument is
* inclusive, so the rounding here is safe.
*/
- return invalidate_inode_pages2_range(mapping,
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
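
As the blkdev_fallocate() hunk above shows, callers now invalidate the page cache through truncate_bdev_range() instead of calling truncate_inode_pages_range() directly. A hedged usage sketch (the function name here is hypothetical; truncate_bdev_range() claims the device internally when FMODE_EXCL is not held, as its comment explains):

/* Sketch: drop cached pages for a range before punching/zeroing it. */
static int punch_prepare_sketch(struct block_device *bdev, fmode_t mode,
				loff_t start, loff_t end)
{
	int err;

	err = truncate_bdev_range(bdev, mode, start, end);
	if (err)
		return err;	/* e.g. another exclusive owner exists */

	/* ...issue the actual discard or zero-range operation here... */
	return 0;
}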
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 575636f6491e..68b95ad82126 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,7 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ea1c28ccb44f..b3268f4ea5f3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
while (!list_empty(&pending_edge)) {
struct btrfs_backref_node *upper;
struct btrfs_backref_node *lower;
- struct rb_node *rb_node;
edge = list_first_entry(&pending_edge,
struct btrfs_backref_edge, list[UPPER]);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ea8aaf36647e..c0f1d6818df7 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
}
static struct btrfs_block_group *btrfs_create_block_group_cache(
@@ -1873,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static int read_block_group_item(struct btrfs_block_group *cache,
+static void read_block_group_item(struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct btrfs_key *key)
{
@@ -1887,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache,
sizeof(bgi));
cache->used = btrfs_stack_block_group_used(&bgi);
cache->flags = btrfs_stack_block_group_flags(&bgi);
-
- return 0;
}
static int read_one_block_group(struct btrfs_fs_info *info,
@@ -1907,9 +1899,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- ret = read_block_group_item(cache, path, key);
- if (ret < 0)
- goto error;
+ read_block_group_item(cache, path, key);
set_free_space_tree_thresholds(cache);
@@ -2035,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_release_path(path);
}
- rcu_read_lock();
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
+
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2056,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
- rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@@ -2097,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
return;
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -2111,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -2785,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
@@ -2961,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -2994,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -3002,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3338,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->block_group_cache_lock);
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
-
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
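
The hunks touching btrfs_add_reserved_bytes() and btrfs_free_reserved_bytes() share one idea: whenever reserved space shrinks (compression used less than was reserved, or a reservation is returned), waiting tickets may become satisfiable and should be retried right away. A schematic sketch under that assumption, with the space_info and cache spinlocks elided:

/* Sketch only: shrinking a reservation immediately retries waiters. */
static void return_reservation_sketch(struct btrfs_block_group *cache,
				      struct btrfs_space_info *space_info,
				      u64 num_bytes)
{
	/* Both counters are updated under space_info->lock in the real code. */
	cache->reserved -= num_bytes;
	space_info->bytes_reserved -= num_bytes;
	btrfs_try_granting_tickets(cache->fs_info, space_info);
}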
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c47b6c6fea9f..92dd86bceae3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,14 +21,18 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE,
+ BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -212,6 +216,11 @@ struct btrfs_inode {
struct inode vfs_inode;
};
+static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
+{
+ return inode->root->fs_info->sectorsize;
+}
+
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -324,23 +333,6 @@ struct btrfs_dio_private {
u8 csums[];
};
-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1ab56a734e70..eeface30facd 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -29,41 +29,6 @@
#include "extent_io.h"
#include "extent_map.h"
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
-void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
-
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
-void lzo_free_workspace(struct list_head *ws);
-
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
-void zstd_put_workspace(struct list_head *ws);
-
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9f3dbe372631..8001b700ea3a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cd392da69b81..113da62dc17f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
const struct btrfs_disk_key *disk_key,
int level,
u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *ret;
@@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
ret = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, disk_key, level,
- hint, empty_size);
+ hint, empty_size, nest);
trans->can_flush_pending_bgs = true;
return ret;
@@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = tree_mod_log_free_eb(buf);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1446,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1485,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -1657,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1855,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
@@ -1891,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = NULL;
if (left) {
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1906,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
if (right) {
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -2069,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
@@ -2077,7 +2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2123,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
@@ -2132,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2601,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ b = __btrfs_read_lock_root_node(root, p->recurse);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2740,11 +2755,13 @@ again:
btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2875,7 +2892,8 @@ cow_done:
} else {
if (!btrfs_tree_read_lock_atomic(b)) {
btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
+ p->recurse);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -3164,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption cannot be detected by the tree-checker:
+ *
+ * Leaf @left			| Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
+ *
+ * Key f6 in leaf @left is valid by itself, but not when the first
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since the tree-checker has ensured the key order inside each tree
+ * block is correct, we only need to check the last key of @left and the
+ * first key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3207,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3275,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
@@ -3331,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3461,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ c->start, 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3730,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
@@ -3739,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
@@ -3751,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3963,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
@@ -3974,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -3988,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4236,8 +4330,18 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
+ /*
+ * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4482,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4657,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @cpu_key: array of keys for items to be inserted
+ * @data_size: size of the body of each item we are going to insert
+ * @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+ int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -4675,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -4701,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4734,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
data_end -= data_size[i];
+ btrfs_set_token_item_offset(&token, item, data_end);
btrfs_set_token_item_size(&token, item, data_size[i]);
}
@@ -4777,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
return 0;
}
@@ -5115,7 +5229,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
@@ -5379,7 +5493,9 @@ again:
}
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5414,7 +5530,9 @@ again:
ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
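
Every btrfs_cow_block() and __btrfs_tree_lock() call in this file now carries an enum btrfs_lock_nesting value, giving lockdep a distinct subclass for each sibling/child lock taken while a tree lock is already held. Collecting the enumerators used above suggests roughly the following shape; the authoritative definition lives in fs/btrfs/locking.h, which this section does not show, and the trailing _MAX entry is an assumption:

/* Assumed shape of the nesting enum, reconstructed from the call sites. */
enum btrfs_lock_nesting {
	BTRFS_NESTING_NORMAL,
	BTRFS_NESTING_COW,
	BTRFS_NESTING_LEFT,
	BTRFS_NESTING_LEFT_COW,
	BTRFS_NESTING_RIGHT,
	BTRFS_NESTING_RIGHT_COW,
	BTRFS_NESTING_SPLIT,
	BTRFS_NESTING_NEW_ROOT,
	BTRFS_NESTING_MAX,	/* assumed terminator, not seen in this diff */
};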
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9a72896bed2e..aac3d6f4e35b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -374,6 +374,7 @@ struct btrfs_path {
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
+ unsigned int recurse:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_DONE = 2,
};
-void btrfs_init_async_reclaim_work(struct work_struct *work);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
struct reloc_control;
@@ -541,11 +542,6 @@ enum {
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
- BTRFS_FS_EXCL_OP,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -565,6 +561,19 @@ enum {
BTRFS_FS_DISCARD_RUNNING,
};
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -912,6 +921,7 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -935,6 +945,9 @@ struct btrfs_fs_info {
*/
int send_in_progress;
+ /* Type of exclusive operation running */
+ unsigned long exclusive_operation;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1181,24 +1194,40 @@ struct btrfs_root {
#endif
};
-struct btrfs_clone_extent_info {
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
u64 disk_offset;
u64 disk_len;
u64 data_offset;
u64 data_len;
u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
char *extent_buf;
- u32 item_size;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
};
struct btrfs_file_private {
void *filldir_buf;
};
-static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
-{
- return btrfs_sb(inode->i_sb)->sectorsize;
-}
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
@@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token,
#define cpu_to_le8(v) (v)
#define __le8 u8
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
#define read_eb_member(eb, ptr, type, member, result) (\
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
@@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
const type *p = page_address(eb->pages[0]); \
- u##bits res = le##bits##_to_cpu(p->member); \
- return res; \
+ return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
type *p = page_address(eb->pages[0]); \
- p->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &p->member); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const type *s) \
{ \
- return le##bits##_to_cpu(s->member); \
+ return get_unaligned_le##bits(&s->member); \
} \
static inline void btrfs_set_##name(type *s, u##bits val) \
{ \
- s->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &s->member); \
}
-
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
@@ -2524,7 +2561,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size);
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum {
*
 * Can be interrupted by a fatal signal.
*/
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
/*
@@ -2619,7 +2659,7 @@ enum btrfs_flush_state {
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
@@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret);
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+ int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2930,6 +2969,10 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
@@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
@@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
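
With the switch to get_unaligned_le##bits/put_unaligned_le##bits (plus the new le8 helpers above), the stack accessors now tolerate unaligned members. For a hypothetical BTRFS_SETGET_STACK_FUNCS(stack_foo, struct btrfs_foo, bar, 64) the macro expands to roughly:

/* Illustrative expansion; struct btrfs_foo and its bar member are made up. */
static inline u64 btrfs_stack_foo(const struct btrfs_foo *s)
{
	return get_unaligned_le64(&s->bar);
}

static inline void btrfs_set_stack_foo(struct btrfs_foo *s, u64 val)
{
	put_unaligned_le64(val, &s->bar);
}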
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 0e354e9e57d0..bacee09b7bfd 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
-
-again:
- /* Make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * If we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* Commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
- spin_unlock(&data_sinfo->lock);
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return 0;
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
- spin_unlock(&data_sinfo->lock);
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bf1595a42a98..5aba81e16113 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
- fs_info->nodesize, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
}
/* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size,
- total_data_size, total_size, nitems);
+ setup_items_for_insert(root, path, keys, data_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index db93909b25e0..4a0243cb9d97 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -64,10 +64,6 @@
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
@@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *device;
struct block_device *bdev;
- struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding) {
+ if (srcdev->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
@@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->dev_replace.replace_wait);
}
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device, we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
- /* replace old device with new one in mapping tree */
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
@@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
+error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
+ btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
- btrfs_rm_dev_replace_free_srcdev(src_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
-}
-
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
+ btrfs_rm_dev_replace_free_srcdev(src_device);
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
- if (srcdev == map->stripes[i].dev)
- map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
+ return 0;
}
/*
@@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
@@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data)
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return 0;
}
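Both hunks above follow the same try-lock/unlock pattern around the new exclusive-operation helpers declared in ctree.h; a minimal sketch (the -EBUSY return is an assumption about how a caller would typically react, not code from this patch):

/* Sketch of the btrfs_exclop_start()/btrfs_exclop_finish() pairing. */
static int run_exclusive_op_example(struct btrfs_fs_info *fs_info)
{
	/* Fails if some other exclusive operation is already running. */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE))
		return -EBUSY;

	/* ... perform the device replace (or balance, resize, ...) ... */

	btrfs_exclop_finish(fs_info);
	return 0;
}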
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index abf86b202b43..8e3438672a82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
#endif
/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- read_unlock(&em_tree->lock);
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- em->start = 0;
- em->len = (u64)-1;
- em->block_len = (u64)-1;
- em->block_start = 0;
-
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- if (ret == -EEXIST) {
- free_extent_map(em);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em)
- em = ERR_PTR(-EIO);
- } else if (ret) {
- free_extent_map(em);
- em = ERR_PTR(ret);
- }
- write_unlock(&em_tree->lock);
-
-out:
- return em;
-}
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
- int ret = 1;
+ u8 *metadata_uuid;
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE);
- while (fs_devices) {
- u8 *metadata_uuid;
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+	 * seed devices it's forbidden to have their uuid changed, so reading
+	 * ->fsid in this case is fine.
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
- /*
- * Checking the incompat flag is only valid for the current
- * fs. For seed devices it's forbidden to have their uuid
- * changed so reading ->fsid in this case is fine
- */
- if (fs_devices == fs_info->fs_devices &&
- btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
- ret = 0;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- return ret;
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
}
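The rewrite also reflects seed-device tracking moving from the old singly linked ->seed pointer to a proper list head (->seed_list), visible in the two iteration styles within this hunk:

/* Old style (removed above): chase the ->seed pointer chain. */
while (fs_devices) {
	/* ... compare fsid against fs_devices ... */
	fs_devices = fs_devices->seed;
}

/* New style: walk the seed_list with the standard list iterator. */
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
	/* ... compare fsid against seed_devs->fsid ... */
}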
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror)
{
u64 found_start;
int found_level;
@@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
-
- memcpy(&found, result, csum_size);
+ u8 val[BTRFS_CSUM_SIZE] = { 0 };
read_extent_buffer(eb, &val, 0, csum_size);
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %x found %x level %d",
+ "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
fs_info->sb->s_id, eb->start,
- val, found, btrfs_header_level(eb));
+ CSUM_FMT_VALUE(csum_size, val),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
ret = -EUCLEAN;
goto err;
}
@@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info,
return 1;
}
-static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(fs_info, BTRFS_I(inode));
@@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_readpage(struct file *file, struct page *page)
-{
- return extent_read_full_page(page, btree_get_extent, 0);
-}
-
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
@@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page)
}
static const struct address_space_operations btree_aops = {
- .readpage = btree_readpage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
@@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0);
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
btrfs_put_root(root);
return ERR_CAST(leaf);
@@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *root;
while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
- btrfs_err(fs_info, "leaked root %llu-%llu refcount %d",
- root->root_key.objectid, root->root_key.offset,
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(root->root_key.objectid, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
@@ -2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_INODE_IO, inode);
+ IO_TREE_BTREE_INODE_IO, inode);
BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
-
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
+ if (IS_ERR(tree_root->node)) {
handle_error = true;
+ ret = PTR_ERR(tree_root->node);
+ tree_root->node = NULL;
+ btrfs_warn(fs_info, "couldn't read tree root");
+ continue;
- if (IS_ERR(tree_root->node)) {
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
- ret = -EUCLEAN;
- }
-
- btrfs_warn(fs_info, "failed to read tree root");
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+ ret = -EIO;
+ btrfs_warn(fs_info, "error while reading tree root");
continue;
}
@@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
+ btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type(disk_super);
@@ -3091,8 +3032,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sb_buffer;
}
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3483,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_CAST(page);
super = page_address(page);
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
+ }
+
+ if (btrfs_super_bytenr(super) != bytenr) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -4057,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4688,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
-
-static const struct extent_io_ops btree_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btree_submit_bio_hook,
- .readpage_end_io_hook = btree_readpage_end_io_hook,
-};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 00dc39d47ed3..fee69ced58b4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 219a09a2b734..9800a8306368 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -40,6 +40,7 @@ struct io_failure_record;
enum {
IO_TREE_FS_PINNED_EXTENTS,
IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
@@ -48,6 +49,7 @@ enum {
IO_TREE_INODE_FILE_EXTENT,
IO_TREE_LOG_CSUM_RANGE,
IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
};
struct extent_io_tree {
@@ -61,7 +63,6 @@ struct extent_io_tree {
u8 owner;
spinlock_t lock;
- const struct extent_io_ops *ops;
};
struct extent_state {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 780b9c9a98fe..3b21fee13e77 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1177,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ /*
+		 * We're adding refs to a tree block we already own; this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
update_inline_extent_backref(path, iref, refs_to_add,
extent_op, NULL);
} else if (ret == -ENOENT) {
@@ -1397,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * on how it works.
+ *
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
@@ -2849,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len -= to_add;
}
spin_unlock(&global_rsv->lock);
- /* Add to any tickets we may have */
- if (len)
- btrfs_try_granting_tickets(fs_info,
- space_info);
}
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2935,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ *    They are either inline in the EXTENT/METADATA_ITEM or in a keyed
+ *    SHARED_* item. Locate them, then reduce the ref count or remove the
+ *    backref item completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get something like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get something like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -2967,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
if (is_data)
skinny_metadata = false;
@@ -2976,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+		 * Here is a quick path to locate the EXTENT/METADATA_ITEM.
+		 * It's possible the EXTENT/METADATA_ITEM is near the current slot.
+ */
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -2992,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+
+			/* Quick path didn't find the EXTENT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
- BUG_ON(iref);
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, path, NULL,
refs_to_drop,
is_data, &last_ref);
@@ -3009,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
path->leave_spinning = 1;
+ /* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
@@ -3083,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_err(info,
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
- ret = -EINVAL;
- btrfs_abort_transaction(trans, ret);
- goto out;
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
}
refs -= refs_to_drop;
@@ -3107,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- BUG_ON(!found_extent);
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
@@ -3122,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ /* In this branch refs == 1 */
if (found_extent) {
- BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(path, iref));
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
if (iref) {
- BUG_ON(path->slots[0] != extent_slot);
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
- BUG_ON(path->slots[0] != extent_slot + 1);
+ /*
+				 * No inline ref, so we must be at a SHARED_* item,
+				 * and since it's a single ref the layout must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
path->slots[0] = extent_slot;
num_to_del = 2;
}
@@ -3169,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
}
/*
@@ -3918,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- Push harder to find free extents
* |- If not found, re-iterate all block groups
*/
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
+static noinline int find_free_extent(struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
@@ -3954,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
space_info = btrfs_find_space_info(fs_info, flags);
if (!space_info) {
@@ -4203,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
@@ -4504,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, int level, u64 owner)
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
@@ -4527,7 +4680,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
btrfs_set_buffer_lockdep_class(owner, buf, level);
- btrfs_tree_lock(buf);
+ __btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -4573,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
@@ -4589,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- level, root_objectid);
+ level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -4606,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
- root_objectid);
+ root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a940edb1e64f..60f5f68d892d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (is_data_inode(tree->private_data))
+ ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
else
- btrfsic_submit_bio(bio);
+ ret = btrfs_submit_metadata_bio(tree->private_data, bio,
+ mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
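The is_data_inode() helper used here presumably wraps the same test that the removed lines in end_bio_extent_readpage() further down open-coded, i.e. whether the inode is a regular data inode rather than the btree inode; assumed shape:

/* Assumed helper shape, matching the open-coded check removed below. */
static inline bool is_data_inode(struct inode *inode)
{
	return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
}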
@@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
- tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
@@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio)
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
+ if (is_data_inode(inode))
+ ret = btrfs_verify_data_csum(io_bio, offset, page,
+ start, end, mirror);
+ else
+ ret = btrfs_validate_metadata_buffer(io_bio,
+ offset, page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate))
goto readpage_ok;
- if (data_inode) {
+ if (is_data_inode(inode)) {
/*
* The generic bio_readpage_error handles errors the
@@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (!btrfs_submit_read_repair(inode, bio, offset, page,
start - page_offset(page),
start, end, mirror,
- tree->ops->submit_bio_hook)) {
+ btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
offset += len;
continue;
@@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf,
else
contig = bio_end_sector(bio) == sector;
- ASSERT(tree->ops);
if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
@@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page)
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page,
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
+ end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end);
@@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page,
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
- end_bio_extent_readpage, mirror_num,
+ end_bio_extent_readpage, 0,
*bio_flags,
this_bio_flag,
force_bio_submit);
@@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+ btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
- return ret;
-}
-
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
-
- ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
-
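With extent_read_full_page() and __extent_read_full_page() removed, their work moves into the btrfs read path proper. A hedged sketch of what a caller such as btrfs_readpage() (in inode.c, which is not part of this hunk) would look like when driving the newly exported btrfs_do_readpage()/submit_one_bio() pair, reconstructed from the two removed functions above:

/* Sketch only: inode.c is not shown in this diff, so the exact caller
 * is assumed from the exported prototypes in extent_io.h. */
int btrfs_readpage(struct file *file, struct page *page)
{
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	unsigned long bio_flags = 0;
	struct bio *bio = NULL;
	int ret;

	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);

	ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
	if (bio)
		ret = submit_one_bio(bio, 0, bio_flags);
	return ret;
}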
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
@@ -4552,7 +4516,7 @@ next:
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
+static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
+ em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
@@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
@@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
goto out_free_ulist;
} else {
@@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
@@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* then we're just getting a count and we can skip the
* lookup stuff.
*/
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
@@ -4898,7 +4861,7 @@ out_free:
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
@@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
eb->blocking_writers = 0;
- eb->lock_nested = false;
+ eb->lock_recursed = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ page, page_offset(page), PAGE_SIZE, 0,
+ &bio, end_bio_extent_readpage,
+ mirror_num, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+				 * We failed to submit the bio, so it's the
+				 * caller's responsibility to perform cleanup,
+				 * i.e. unlock the page and set the error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5622,6 +5584,36 @@ unlock_exit:
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offsets inside the eb, not logical addresses.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
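The check_add_overflow() test is what the old checks were missing: an addition that wraps around can slip past a plain `start + len > eb->len` comparison, as in read_extent_buffer() below. An illustration with hypothetical values:

/* With eb->len == 16384 and hypothetical inputs: */
unsigned long start = ULONG_MAX - 8;
unsigned long len = 32;
unsigned long offset;

/* Old check: start + len wraps to 23, so the range looks "valid". */
bool old_rejects = (start + len > 16384UL);			/* false */

/* New check: check_add_overflow() reports the wrap and rejects it. */
bool new_rejects = check_add_overflow(start, len, &offset) ||
		   offset > 16384UL;				/* true */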
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
char *dst = (char *)dstv;
unsigned long i = start >> PAGE_SHIFT;
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
offset = offset_in_page(start);
@@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
offset = offset_in_page(start);
@@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
char *src = (char *)srcv;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
char *kaddr;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst,
char *kaddr;
unsigned long i = dst_offset >> PAGE_SHIFT;
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
+
WARN_ON(src->len != dst_len);
offset = offset_in_page(dst_offset);
@@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
dst_off_in_page = offset_in_page(dst_offset);
@@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
@@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 30794ae58498..f39d02e7f7ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
-struct extent_io_ops {
- /*
- * The following callbacks must be always defined, the function
- * pointer will be called unconditionally.
- */
- submit_bio_hook_t *submit_bio_hook;
- int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror);
-};
-
-
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
@@ -102,7 +90,7 @@ struct extent_buffer {
int blocking_writers;
atomic_t blocking_readers;
- bool lock_nested;
+ bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
@@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num);
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7d5ec71615b8..8f4f2bd6d9b9 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- csum, nblocks);
+ count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
+ disk_bytenr, csum, nblocks);
if (count)
goto found;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4507c3d09399..0ff659455b1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1057,11 +1057,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
*key_inserted = 1;
}
@@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
+ * in iomap, as it calls generic_write_sync() in this case.
+ * 2. If we are async, we can call iomap_dio_complete() either
+ * in
+ *
+		 * 2.1. In a worker thread, after the last bio completed. In
+		 * this case we need to mark the btrfs_dio_data as async so
+		 * that generic_write_sync() is called properly.
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
+ * current->journal_info.
+		 * 2.2. The submitter context, because all IO completed
+ * before we exited iomap_dio_rw(). In this case we can
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
+ * the sync below. If our ->end_io() gets called and
+ * current->journal_info is set, then we know we're in
+ * our current context and we will clear
+ * current->journal_info to indicate that we need to
+ * sync below.
+ */
+ if (sync) {
+ ASSERT(current->journal_info == NULL);
+ iocb->ki_flags &= ~IOCB_DSYNC;
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
num_written = __btrfs_direct_write(iocb, from);
+
+ /*
+ * As stated above, we cleared journal_info, so we need to do
+ * the sync ourselves.
+ */
+ if (sync && current->journal_info == NULL)
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->journal_info = NULL;
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by setattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
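The journal_info protocol described in btrfs_file_write_iter() above implies a matching check on the completion side. A hypothetical sketch of that counterpart (the helper name and signature are invented; the real ->end_io plumbing belongs to other patches in this series):

/* Hypothetical completion-side counterpart to the BTRFS_DIO_SYNC_STUB
 * protocol above; these names are illustrative, not btrfs symbols. */
static ssize_t example_dio_write_done(struct kiocb *iocb, ssize_t ret)
{
	if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
		/*
		 * Completed in the submitter's context: clear the stub so
		 * btrfs_file_write_iter() re-sets IOCB_DSYNC and performs
		 * the sync itself (case 2.2 above).
		 */
		current->journal_info = NULL;
		return ret;
	}
	/* Async completion in a worker: do the sync here (case 2.1). */
	return generic_write_sync(iocb, ret);
}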
@@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
- * Set the range to full if the NO_HOLES feature is not enabled.
- * This is to avoid missing file extent items representing holes after
- * replaying the log.
+ * Always set the range to a full range, otherwise we can get into
+	 * several problems, from missing file extent items that represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
@@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges and races between logging and
- * completion of ordered extents for adjancent ranges - both races
- * could lead to file extent items in the log with overlapping ranges.
- * Do this while holding the inode lock, to avoid races with other
- * tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must either stay set or
+ * stay clear for the whole duration of the logging.
*/
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
@@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion. This avoids data loss if, after
+ * we fsync, the current transaction commits before the ordered extents
+ * complete and a power failure happens right after that.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and instead use the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
+ /*
+ * If we are doing a fast fsync we cannot bail out if the inode's
+ * last_trans is <= the last committed transaction, because we only
+ * update the inode's last_trans during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for
+ * writeback to complete.
+ */
smp_mb();
if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
+ (full_sync || list_empty(&ctx.ordered_extents)))) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked, i.e. since the last time fsync was called.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
}
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);
@@ -2286,6 +2335,12 @@ out:
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ up_write(&BTRFS_I(inode)->dio_sem);
+ inode_unlock(inode);
+ goto out;
}
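
The new out_release_extents label applies the usual kernel unwind idiom: release what was acquired, in reverse order (log context extents, dio_sem, inode lock), then funnel into the common exit path. A generic userspace sketch of the same shape; the locks and the failure are invented for illustration:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_rwlock_t dio_sem = PTHREAD_RWLOCK_INITIALIZER;

    static int do_sync(int fail_early)
    {
        int ret = 0;

        pthread_mutex_lock(&inode_lock);    /* inode_lock()           */
        pthread_rwlock_wrlock(&dio_sem);    /* down_write(&dio_sem)   */

        if (fail_early) {
            ret = -5;                       /* e.g. -EIO from waiting */
            goto out_release;               /* unwind in reverse order */
        }

        /* ... log the inode, start/commit the transaction ... */

    out_release:
        pthread_rwlock_unlock(&dio_sem);
        pthread_mutex_unlock(&inode_lock);
        return ret;
    }

    int main(void)
    {
        printf("do_sync() = %d\n", do_sync(1));
        return 0;
    }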
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ lockend);
/*
* We need to make sure we have no ordered extents in this range
@@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
return 0;
}
-static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
- struct btrfs_clone_extent_info *clone_info,
- const u64 clone_len)
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
- u64 ref_offset;
int ret;
- if (clone_len == 0)
+ if (replace_len == 0)
return 0;
- if (clone_info->disk_offset == 0 &&
+ if (extent_info->disk_offset == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = clone_info->file_offset;
+ key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, clone_info->extent_buf,
+ write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
- btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
- clone_info->file_offset, clone_len);
+ extent_info->file_offset, replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
- if (clone_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0)
return 0;
- inode_add_bytes(inode, clone_len);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- clone_info->disk_offset,
- clone_info->disk_len, 0);
- ref_offset = clone_info->file_offset - clone_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
+ inode_add_bytes(inode, replace_len);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
return ret;
}
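
A rough model of the insertions-counter logic above: the first insertion of a brand-new extent consumes the reservation (the btrfs_alloc_reserved_file_extent() branch), while every later insertion, and every clone, only adds a reference (the btrfs_inc_extent_ref() branch). The structures below are invented for illustration:

    #include <stdio.h>

    struct extent { int refs; };

    /* First insertion consumes the reservation; later ones just add refs. */
    static void insert_extent(struct extent *e, int is_new, int *insertions)
    {
        if (is_new && *insertions == 0)
            e->refs = 1;    /* alloc_reserved_file_extent() analogue */
        else
            e->refs++;      /* inc_extent_ref() analogue */
        (*insertions)++;
    }

    int main(void)
    {
        struct extent e = { 0 };
        int insertions = 0;

        insert_extent(&e, 1, &insertions);  /* claims the reservation */
        insert_extent(&e, 1, &insertions);  /* adds a reference       */
        printf("refs=%d insertions=%d\n", e.refs, insertions);
        return 0;
    }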
@@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
- * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
- * cloning.
- * When cloning, we don't want to end up in a state where we dropped extents
- * without inserting a new one, so we must abort the transaction to avoid a
- * corruption.
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we
+ * dropped extents without inserting a new one, so we must abort the
+ * transaction to avoid corruption.
*/
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set or if we are cloning
- * an extent
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
@@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* returned by __btrfs_drop_extents() without having
* changed anything in the file.
*/
- if (clone_info && ret && ret != -EOPNOTSUPP)
+ if (extent_info && !extent_info->is_new_extent &&
+ ret && ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!clone_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
@@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
@@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info && drop_end > clone_info->file_offset) {
- u64 clone_len = drop_end - clone_info->file_offset;
+ if (extent_info && drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_end - extent_info->file_offset;
- ret = btrfs_insert_clone_extent(trans, inode, path,
- clone_info, clone_len);
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- clone_info->data_len -= clone_len;
- clone_info->data_offset += clone_len;
- clone_info->file_offset += clone_len;
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
}
cur_offset = drop_end;
@@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!clone_info) {
+ if (!extent_info) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
@@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* than 16Mb would force the full fsync anyway (when
* try_release_extent_mapping() is invoked during page cache truncation).
*/
- if (clone_info)
+ if (extent_info && !extent_info->is_new_extent)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* (because it's useless) or if it represents a zero-length range (when
* cur_offset == drop_end).
*/
- if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
if (ret) {
@@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_end - cur_offset);
@@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info) {
- ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
- clone_info->data_len);
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
+ extent_info->data_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
&trans);
btrfs_free_path(path);
if (ret)
@@ -3044,7 +3119,7 @@ enum {
RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode,
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
@@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode,
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
alloc_start = round_down(offset, blocksize);
@@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ locked_end);
if (ordered &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
@@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
+}
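
btrfs_file_read_iter() lets the direct path read what it can and then hands the remainder to generic_file_buffered_read(), passing the bytes already read so the buffered path resumes where direct IO stopped (for example over a hole or an inline extent). A userspace sketch of that fallthrough, with invented helper names:

    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>

    /* Pretend the direct path can only service the first half. */
    static ssize_t direct_read(char *buf, size_t len)
    {
        size_t n = len / 2;

        memset(buf, 'D', n);
        return (ssize_t)n;
    }

    /* Resumes at 'already' bytes, like generic_file_buffered_read(). */
    static ssize_t buffered_read(char *buf, size_t len, ssize_t already)
    {
        memset(buf + already, 'B', len - (size_t)already);
        return (ssize_t)len;
    }

    static ssize_t read_iter(char *buf, size_t len, int direct)
    {
        ssize_t ret = 0;

        if (direct) {
            ret = direct_read(buf, len);
            if (ret < 0)
                return ret;
        }
        return buffered_read(buf, len, ret);  /* picks up at 'ret' */
    }

    int main(void)
    {
        char buf[9] = { 0 };

        read_iter(buf, 8, 1);
        printf("%s\n", buf);  /* DDDDBBBB */
        return 0;
    }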
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dc82fd0c80cb..af0013d3df63 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *val;
-
io_ctl_map_page(io_ctl, 1);
/*
@@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- val = io_ctl->cur;
- *val = cpu_to_le64(generation);
+ put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
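
The switch to put_unaligned_le64()/get_unaligned_le64() matters because io_ctl->cur is a byte pointer into a mapped page and is not guaranteed to be 8-byte aligned once the optional checksum area has been skipped; dereferencing a misaligned __le64 pointer is undefined behaviour on some architectures. A portable userspace equivalent of the two helpers, built on memcpy (which compilers lower to plain loads/stores where alignment allows):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void put_unaligned_le64(uint64_t val, void *p)
    {
        uint8_t b[8];

        for (int i = 0; i < 8; i++)
            b[i] = (uint8_t)(val >> (8 * i));  /* little endian */
        memcpy(p, b, 8);
    }

    static uint64_t get_unaligned_le64(const void *p)
    {
        uint8_t b[8];
        uint64_t val = 0;

        memcpy(b, p, 8);
        for (int i = 0; i < 8; i++)
            val |= (uint64_t)b[i] << (8 * i);
        return val;
    }

    int main(void)
    {
        uint8_t buf[9];

        put_unaligned_le64(0x1122334455667788ULL, buf + 1); /* misaligned */
        printf("0x%llx\n", (unsigned long long)get_unaligned_le64(buf + 1));
        return 0;
    }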
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *gen;
+ u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- gen = io_ctl->cur;
- if (le64_to_cpu(*gen) != generation) {
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
- *gen, generation);
+ cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
return -ENOSPC;
entry = io_ctl->cur;
- entry->offset = cpu_to_le64(offset);
- entry->bytes = cpu_to_le64(bytes);
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
@@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
}
e = io_ctl->cur;
- entry->offset = le64_to_cpu(e->offset);
- entry->bytes = le64_to_cpu(e->bytes);
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
@@ -1353,7 +1350,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9570458aa847..936c3137c646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,7 +6,6 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -31,6 +30,7 @@
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -59,9 +59,10 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
@@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
static int btrfs_dirty_inode(struct inode *inode);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -2183,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
*
* c-3) otherwise: async submit
*/
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2245,16 +2237,15 @@ out:
* given a list of ordered sums, record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
- struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
{
struct btrfs_ordered_sum *sum;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans,
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2357,7 +2348,7 @@ again:
unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2548,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
}
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
@@ -2568,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
- return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
- &stack_fi, oe->qgroup_rsv);
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ oe->qgroup_rsv);
}
/*
@@ -2666,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_ordered_extent_file_extent(trans, inode,
- ordered_extent);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2683,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -2752,7 +2742,7 @@ out:
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(inode, ordered_extent);
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2772,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
@@ -2784,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -2833,9 +2823,8 @@ zeroit:
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
*/
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror)
{
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
@@ -3055,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
/*
@@ -3395,7 +3383,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
@@ -4051,7 +4038,7 @@ out_end_trans:
err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
if (err) {
@@ -4583,7 +4570,7 @@ again:
&cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -4848,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/*
* We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
*/
if (newsize == 0)
- set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -5305,15 +5289,15 @@ static void inode_tree_add(struct inode *inode)
spin_unlock(&root->inode_lock);
}
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
@@ -6311,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
d_instantiate_new(dentry, inode);
out_unlock:
@@ -6374,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
@@ -6389,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
@@ -6540,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
- int err = 0;
+ int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6569,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}
em = alloc_extent_map();
if (!em) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
@@ -6579,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -6592,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
*/
path->leave_spinning = 1;
+ path->recurse = btrfs_is_free_space_inode(inode);
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
+ ret = 0;
}
leaf = path->nodes[0];
@@ -6625,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only a regular file can have regular/prealloc extents */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
@@ -6643,12 +6621,11 @@ next:
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- } else if (ret > 0) {
+ else if (ret > 0)
goto not_found;
- }
+
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6699,10 +6676,8 @@ next:
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6726,29 +6701,28 @@ not_found:
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
+ ret = 0;
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
- err = -EIO;
+ ret = -EIO;
goto out;
}
- err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- if (err) {
+ if (ret) {
free_extent_map(em);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
- BUG_ON(!em); /* Error is always set */
return em;
}
@@ -7111,7 +7085,7 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7160,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
else
ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
@@ -7249,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7333,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
@@ -7344,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
+
+ /*
+ * We use current->journal_info here to see if we were called for a
+ * sync write, but there are many checks in the enospc machinery that
+ * skip flushing when journal_info is set, so we need to clear it here
+ * and re-set it in iomap_end.
+ */
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
+ current->journal_info = NULL;
- if (!create)
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic iomap code only does filemap_write_and_wait_range(),
+ * which isn't enough if we've written compressed pages to this area,
+ * so we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
+
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->sync = sync;
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
}
+ iomap->private = dio_data;
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fall back to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
@@ -7433,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though the iomap code
+ * does that, since we have locked only the parts we are performing
+ * I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
}
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
free_extent_map(em);
@@ -7471,8 +7457,63 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, start,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
+ return ret;
+}
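
The iomap_begin hook boils down to translating one extent map into the fields iomap consumes: holes (and, for reads, preallocated extents) become IOMAP_HOLE with IOMAP_NULL_ADDR, anything else becomes IOMAP_MAPPED with the address offset into the extent, trimmed to what the extent actually covers. A simplified userspace model of that translation, with types reduced from the kernel's struct iomap:

    #include <stdint.h>
    #include <stdio.h>

    #define MAP_HOLE  UINT64_MAX  /* stands in for EXTENT_MAP_HOLE */
    #define NULL_ADDR UINT64_MAX  /* stands in for IOMAP_NULL_ADDR */

    struct extent_map { uint64_t start, len, block_start; };
    struct io_map     { uint64_t addr, offset, length; int is_hole; };

    static void em_to_iomap(const struct extent_map *em, uint64_t start,
                            uint64_t len, struct io_map *io)
    {
        /* Trim to what the extent actually covers past 'start'. */
        if (len > em->len - (start - em->start))
            len = em->len - (start - em->start);

        if (em->block_start == MAP_HOLE) {
            io->addr = NULL_ADDR;
            io->is_hole = 1;
        } else {
            io->addr = em->block_start + (start - em->start);
            io->is_hole = 0;
        }
        io->offset = start;
        io->length = len;
    }

    int main(void)
    {
        struct extent_map em = { .start = 4096, .len = 8192,
                                 .block_start = 1048576 };
        struct io_map io;

        em_to_iomap(&em, 6144, 16384, &io);  /* request overshoots */
        printf("addr=%llu offset=%llu length=%llu hole=%d\n",
               (unsigned long long)io.addr, (unsigned long long)io.offset,
               (unsigned long long)io.length, io.is_hole);
        return 0;
    }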
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
+ length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
+
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ /*
+ * We're all done, so now we can safely re-set current->journal_info
+ * for our endio.
+ */
+ if (dio_data->sync) {
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
+ kfree(dio_data);
+ iomap->private = NULL;
+
return ret;
}
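
When submission stops early, iomap_end must hand back only the tail that never went out: advance pos by what was submitted and clean up the remaining [pos, pos + length - 1] (failing the ordered extent on writes, unlocking the extent range on reads). The arithmetic in miniature:

    #include <stdio.h>

    /* Models the tail cleanup in btrfs_dio_iomap_end(). */
    static void dio_end(long long pos, long long length, long long submitted)
    {
        if (submitted < length) {
            pos += submitted;      /* first byte that never went out */
            length -= submitted;   /* bytes still needing cleanup    */
            printf("clean up [%lld, %lld]\n", pos, pos + length - 1);
        }
    }

    int main(void)
    {
        dio_end(4096, 8192, 4096);  /* only half the range was submitted */
        return 0;
    }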
@@ -7496,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
dip->logical_offset + dip->bytes - 1);
}
- dio_end_io(dip->dio_bio);
+ bio_endio(dip->dio_bio);
kfree(dip);
}
@@ -7730,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
-
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- /*
- * Setting range start and end to the same value means that
- * no cleanup will happen in btrfs_direct_IO
- */
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
return dip;
}
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7764,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7772,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- dio_end_io(dio_bio);
- return;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
}
if (!write && csum) {
@@ -7844,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
goto out_err;
}
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
} while (submit_len > 0);
- return;
+ return BLK_QC_T_NONE;
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -7888,37 +7919,59 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned flags)
+{
+ /*
+ * Now, if we're still in the context of our submitter, we know we
+ * can't safely run generic_write_sync(), so clear our flag here so
+ * that the caller knows to follow up with a sync.
+ */
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
+ current->journal_info = NULL;
+ return error;
+ }
+
+ if (error)
+ return error;
+
+ if (size) {
+ iocb->ki_flags |= IOCB_DSYNC;
+ return generic_write_sync(iocb, size);
+ }
+
+ return 0;
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+};
+
+static const struct iomap_dio_ops btrfs_sync_dops = {
+ .submit_io = btrfs_submit_direct,
+ .end_io = btrfs_maybe_fsync_end_io,
+};
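
Only the sync variant of the ops table installs ->end_io(); btrfs_direct_IO() then picks a table based on whether the journal_info sentinel is set. The pattern of selecting a callback table by flag, modelled in plain C with invented names:

    #include <stdio.h>

    struct dio_ops {
        void (*submit)(void);
        void (*end_io)(void);  /* may be NULL */
    };

    static void submit_io(void) { printf("submit\n"); }
    static void fsync_end(void) { printf("end_io: maybe fsync\n"); }

    static const struct dio_ops plain_ops = { .submit = submit_io };
    static const struct dio_ops sync_ops  = { .submit = submit_io,
                                              .end_io = fsync_end };

    static void dio_rw(int is_sync)
    {
        const struct dio_ops *ops = is_sync ? &sync_ops : &plain_ops;

        ops->submit();
        if (ops->end_io)
            ops->end_io();
    }

    int main(void)
    {
        dio_rw(0);
        dio_rw(1);
        return 0;
    }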
+
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
- int flags = 0;
- bool wakeup = true;
bool relock = false;
ssize_t ret;
if (check_direct_IO(fs_info, iter, offset))
return 0;
- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need to update
@@ -7926,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* not unlock the i_mutex in this case.
*/
if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
}
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * If we actually have a sync iocb, we need our fancy endio to know
+ * whether we need to sync.
+ */
+ if (current->journal_info)
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
+ else
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
+
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, offset, dio_data.reserve,
- true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(BTRFS_I(inode),
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
- }
-out:
- if (wakeup)
- inode_dio_end(inode);
+
if (relock)
inode_lock(inode);
@@ -8002,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len);
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
{
- return extent_read_full_page(page, btrfs_get_extent, 0);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ unsigned long bio_flags = 0;
+ struct bio *bio = NULL;
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
+ if (bio)
+ ret = submit_one_bio(bio, 0, bio_flags);
+ return ret;
}
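
btrfs_do_readpage() may leave a partially filled bio behind so that neighbouring pages can be merged into one request; a single-page caller like btrfs_readpage() therefore has to submit the leftover bio itself. A sketch of that accumulate-then-flush shape, with invented types:

    #include <stdio.h>

    struct bio { int pages; };

    static struct bio global_bio;

    /* May start or extend a bio without submitting it, so consecutive
     * pages can be merged into a single device request. */
    static int do_readpage(struct bio **biop)
    {
        if (!*biop)
            *biop = &global_bio;
        (*biop)->pages++;
        return 0;
    }

    static int submit_one_bio(struct bio *bio)
    {
        printf("submitting bio with %d page(s)\n", bio->pages);
        bio->pages = 0;
        return 0;
    }

    /* Single-page caller: flush whatever the helper left behind. */
    static int readpage(void)
    {
        struct bio *bio = NULL;
        int ret = do_readpage(&bio);

        if (bio)
            ret = submit_one_bio(bio);
        return ret;
    }

    int main(void)
    {
        return readpage();
    }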
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8091,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping,
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
u64 start;
u64 end;
- int inode_evicting = inode->i_state & I_FREEING;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -8110,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
@@ -8120,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
lock_extent_bits(tree, page_start, page_end, &cached_state);
again:
start = page_start;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- page_end - start + 1);
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
@@ -8142,7 +8168,7 @@ again:
struct btrfs_ordered_inode_tree *tree;
u64 new_len;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8181,7 +8207,7 @@ again:
* bit of its io_tree, and free the qgroup reserved data space,
* since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -8283,7 +8309,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -8614,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
- WARN_ON(!hlist_empty(&inode->i_dentry));
- WARN_ON(inode->i_data.nrpages);
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->csum_bytes);
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen when we create an inode, but somebody else also
@@ -8643,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode)
if (!ordered)
break;
else {
- btrfs_err(fs_info,
+ btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
@@ -8651,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
- btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
- btrfs_put_root(BTRFS_I(inode)->root);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -8780,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
+ int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
/* we only allow renaming a subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -8942,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+ old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -8998,46 +9000,13 @@ out_fail:
dest_log_pinned = false;
}
}
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
@@ -9105,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9259,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = index;
if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -9306,23 +9264,8 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -9388,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9428,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
- ret++;
- if (nr != -1 && ret >= nr)
- goto out;
+ if (*nr != U64_MAX) {
+ (*nr)--;
+ if (*nr == 0)
+ goto out;
+ }
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -9455,18 +9400,15 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ u64 nr = U64_MAX;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1, true);
- if (ret > 0)
- ret = 0;
- return ret;
+ return start_delalloc_inodes(root, &nr, true);
}
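
Passing the budget by pointer lets a single allowance be consumed across several roots: start_delalloc_inodes() now decrements the caller's counter in place, with U64_MAX meaning "no limit". A minimal sketch of that counting pattern, detached from btrfs (the helper name is hypothetical):

	/* Consume one unit from a shared budget; U64_MAX means unlimited. */
	static bool consume_one(u64 *nr)
	{
		if (*nr == U64_MAX)
			return true;	/* unlimited: never stop */
		(*nr)--;
		return *nr > 0;		/* stop once the budget is spent */
	}

btrfs_start_delalloc_roots() below relies on exactly this: each root's pass eats into the same nr, which is why the old "nr -= ret" bookkeeping can go away.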
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -9489,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr, false);
+ ret = start_delalloc_inodes(root, &nr, false);
btrfs_put_root(root);
if (ret < 0)
goto out;
-
- if (nr != -1) {
- nr -= ret;
- WARN_ON(nr < 0);
- }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
@@ -9568,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -9633,11 +9569,15 @@ out_unlock:
return err;
}
-static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
struct inode *inode, struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
int ret;
@@ -9654,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
if (ret < 0)
- return ret;
- return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
- &stack_fi, ret);
+ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
+ file_offset, &stack_fi, ret);
+ if (ret)
+ return ERR_PTR(ret);
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.qgroup_reserved = ret;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return trans;
}
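
insert_prealloc_file_extent() now owns part of the transaction life cycle and reports failure through the returned pointer, using the kernel's ERR_PTR convention from <linux/err.h>. A sketch of the caller-side decoding this enables (mirroring, not copying, the hunk in __btrfs_prealloc_file_range further down):

	trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);	/* unpack the negative errno */
		trans = NULL;		/* no valid handle to end on this path */
		/* ... error path: free the reserved extent and bail out ... */
	}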
+
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -9680,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (trans)
own_trans = false;
while (num_bytes > 0) {
- if (own_trans) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- }
-
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
@@ -9699,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
- if (ret) {
- if (own_trans)
- btrfs_end_transaction(trans);
+ if (ret)
break;
- }
/*
* We've reserved this space, and thus converted it from
@@ -9716,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
- if (ret) {
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, ret);
- if (own_trans)
- btrfs_end_transaction(trans);
break;
}
@@ -9785,8 +9742,10 @@ next:
break;
}
- if (own_trans)
+ if (own_trans) {
btrfs_end_transaction(trans);
+ trans = NULL;
+ }
}
if (clear_offset < end)
btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
@@ -9865,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
if (ret)
@@ -10072,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
/*
* Balance or device remove/replace/resize can move stuff around from
- * under us. The EXCL_OP flag makes sure they aren't running/won't run
- * concurrently while we are mapping the swap extents, and
- * fs_info->swapfile_pins prevents them from running while the swap file
- * is active and moving the extents. Note that this also prevents a
- * concurrent device add which isn't actually necessary, but it's not
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
@@ -10225,7 +10183,7 @@ out:
if (ret)
btrfs_swap_deactivate(file);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (ret)
return ret;
@@ -10283,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static const struct extent_io_ops btrfs_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btrfs_submit_bio_hook,
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
@@ -10306,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = btrfs_direct_IO,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ac45f022b495..ab408a23ba32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags)
return 0;
}
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
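
btrfs_exclop_start() is an atomic test-and-claim: the cmpxchg() succeeds only if exclusive_operation still holds BTRFS_EXCLOP_NONE, so at most one exclusive operation can be active at a time. The ioctls converted below all follow the same claim/work/finish shape; a condensed sketch of that calling pattern (the function name is hypothetical):

	static long btrfs_ioctl_exclusive_sketch(struct btrfs_fs_info *fs_info)
	{
		long ret = 0;

		/* Claim the single exclusive-operation slot or back off. */
		if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
			return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;

		/* ... the device add (or other exclusive work) runs here ... */

		/* Release the slot and notify sysfs watchers. */
		btrfs_exclop_finish(fs_info);
		return ret;
	}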
+
/*
* Set the xflags from the internal inode flags. The remaining items of fsxattr
* are zeroed.
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
goto fail_free;
}
trans->block_rsv = &block_rsv;
@@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -742,7 +755,7 @@ fail:
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
err = btrfs_commit_transaction(trans);
if (err && !ret)
@@ -856,7 +869,7 @@ fail:
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
@@ -1306,7 +1319,7 @@ again:
break;
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
@@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
mnt_drop_write_file(file);
return ret;
}
@@ -2193,7 +2206,8 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = fault_in_pages_writeable(ubuf, *buf_size - sk_offset);
+ ret = fault_in_pages_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset);
if (ret)
break;
@@ -3125,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
vol_args = memdup_user(arg, sizeof(*vol_args));
@@ -3142,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -3171,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -3182,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -3213,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -3231,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3461,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *tmp;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3517,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
break;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3735,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -3950,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -3996,7 +4004,6 @@ again:
}
locked:
- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4051,10 +4058,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
- * btrfs_balance. bctl is freed in reset_balance_state, or, if
- * restriper was paused all the way until unmount, in free_fs_info.
- * The flag should be cleared after reset_balance_state.
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The flag should be
+ * cleared after reset_balance_state.
*/
need_unlock = false;
@@ -4073,7 +4080,7 @@ out_bargs:
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
return ret;
@@ -4896,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f75612e18a82..66e02ebdd340 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,8 +57,8 @@
* performance reasons.
*
*
- * Lock nesting
- * ------------
+ * Lock recursion
+ * --------------
*
* A write operation on a tree might indirectly start a lookup on the same
* tree. This can happen when btrfs_cow_block locks the tree and needs to
@@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
@@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
@@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
*
* The rwlock is held upon exit.
*/
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse)
{
u64 start_ns = 0;
@@ -263,8 +264,9 @@ again:
* depends on this as it may be called on a partly
* (write-)locked tree.
*/
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
+ WARN_ON(!recurse);
+ BUG_ON(eb->lock_recursed);
+ eb->lock_recursed = true;
read_unlock(&eb->lock);
trace_btrfs_tree_read_lock(eb, start_ns);
return;
@@ -279,6 +281,11 @@ again:
trace_btrfs_tree_read_lock(eb, start_ns);
}
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
+}
+
/*
* Lock extent buffer for read, optimistically expecting that there are no
* contending blocking writers. If there are, don't wait.
@@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*
* The rwlock is held for write upon exit.
*/
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -434,6 +441,11 @@ again:
trace_btrfs_tree_lock(eb, start_ns);
}
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
/*
* Release the write lock, either blocking or spinning (ie. there's no need
* for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
@@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
*
* Return: root extent buffer with read lock held
*/
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index d715846c10b8..3ea81ed3320b 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -16,11 +16,81 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
+/*
+ * We are limited in the number of subclasses by MAX_LOCKDEP_SUBCLASSES, which
+ * is 8 at the time of this patch, and we use all 8 of them. Keep this in mind
+ * if you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle the case where we need to allocate the new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBCLASSES subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
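
Each enum value above is meant to become a distinct lockdep subclass, which is why the count must stay at or below MAX_LOCKDEP_SUBCLASSES. The underlying mechanism is the same one lockdep exposes for other lock types through the *_nested() variants; a small mutex-based illustration of why a separate subclass silences the false positive (not btrfs code, purely illustrative):

	#include <linux/mutex.h>

	struct node {
		struct mutex lock;	/* all nodes share one lock class */
		struct node *right;
	};

	static void lock_adjacent_pair(struct node *n)
	{
		mutex_lock(&n->lock);
		/*
		 * Same class, different subclass: lockdep treats this as an
		 * ordered acquisition instead of a self-deadlock, which is
		 * what BTRFS_NESTING_RIGHT expresses for extent buffers.
		 */
		mutex_lock_nested(&n->right->lock, 1);
		mutex_unlock(&n->right->lock);
		mutex_unlock(&n->lock);
	}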
+
struct btrfs_path;
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
@@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse);
+
+static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ return __btrfs_read_lock_root_node(root, false);
+}
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ebac13389e7e..87bac9ecdf4c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
+ trace_btrfs_ordered_extent_add(inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
@@ -377,17 +378,16 @@ out:
* test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
* to make sure this function only returns 1 once for a given ordered extent.
*/
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
@@ -408,7 +408,7 @@ have_entry:
}
if (io_size > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
}
@@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(entry->inode, entry);
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
@@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool pending;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
@@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us; we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality: it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock. So be nice and check if trans is set, but ASSERT() so
+ * that a developer will notice if it isn't set.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
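
The wake-up above is one half of a counter-based rendezvous: the commit path is expected to take a count in trans->pending_ordered for every ordered extent it still has to wait for, then sleep on trans->pending_wait until the last completion drains the counter. A sketch of the waiting side, assuming that is how the transaction commit uses these fields:

	/* Sketch of the waiter the wake_up() above pairs with. */
	static void wait_for_pending_ordered(struct btrfs_transaction *trans)
	{
		wait_event(trans->pending_wait,
			   atomic_read(&trans->pending_ordered) == 0);
	}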
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
- trace_btrfs_ordered_extent_remove(inode, entry);
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
@@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
@@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
trace_btrfs_ordered_extent_start(inode, entry);
@@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -775,17 +803,45 @@ out:
}
/*
+ * Adds all ordered extents to the given list. The list ends up sorted by the
+ * file_offset of the ordered extents.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
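
Every entry put on the list carries an extra reference, so the fsync code consuming the list must drop that reference when it is done. A hypothetical consumer loop, assuming callers walk the offset-sorted list and release each entry after logging it:

	static void release_logged_ordered_extents(struct list_head *list)
	{
		struct btrfs_ordered_extent *ordered, *tmp;

		list_for_each_entry_safe(ordered, tmp, list, log_list) {
			/* ... log csums/extent items for @ordered here ... */
			list_del_init(&ordered->log_list);
			btrfs_put_ordered_extent(ordered);
		}
	}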
+
+/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -803,20 +859,21 @@ out:
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
*/
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len)
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
+ const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
if (!ordered)
return 0;
@@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr &&
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = ordered_sum->len >>
- inode->i_sb->s_blocksize_bits;
+ i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
+ num_sectors = ordered_sum->len >> blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
memcpy(sum + index, ordered_sum->sums + i * csum_size,
num_sectors * csum_size);
@@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
break;
}
unlock_extent_cached(&inode->io_tree, start, end, cachedp);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d61ea9c880a3..c3a2325e64a4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -56,6 +56,12 @@ enum {
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
BTRFS_ORDERED_REGULAR,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
};
struct btrfs_ordered_extent {
@@ -104,6 +110,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* used for fast fsyncs */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
}
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
@@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 80567c11ec12..7695c4783d33 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -7,6 +7,44 @@
#include "disk-io.h"
#include "print-tree.h"
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(u64 objectid, char *buf)
+{
+ int i;
+
+ if (objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", objectid);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid);
+ return buf;
+}
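
The caller provides the scratch buffer, so the helper stays usable from warning paths where allocating would be awkward. A plausible (not verbatim) call site:

	static void report_leaked_root(struct btrfs_root *root)
	{
		char buf[BTRFS_ROOT_NAME_BUF_LEN];

		pr_err("BTRFS: leaked root %s refcount %d\n",
		       btrfs_root_name(root->root_key.objectid, buf),
		       refcount_read(&root->refs));
	}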
+
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e6bb38fd75ad..78b99385a503 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(u64 objectid, char *buf);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c0f350c3a0cf..580899bdb991 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 243a2e44526e..9d4f5316a7e8 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work)
kfree(rmw);
}
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+/* Try to start up to 10k READA requests for a group of devices */
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 enqueued;
u64 total = 0;
- int i;
+ struct btrfs_device *device;
-again:
do {
enqueued = 0;
- mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+ return total;
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ int i;
+ u64 enqueued = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ enqueued += reada_start_for_fsdevs(fs_devices);
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ enqueued += reada_start_for_fsdevs(seed_devs);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
if (enqueued == 0)
return;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 5cd02514cf4d..99aa87c08912 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -45,7 +45,7 @@ out:
return ret;
}
-static int copy_inline_to_page(struct inode *inode,
+static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 file_offset,
char *inline_data,
const u64 size,
@@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode,
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
ASSERT(IS_ALIGNED(file_offset, block_size));
@@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- file_offset, block_size);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
if (ret)
goto out;
- page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(inode->i_mapping));
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
if (!page) {
ret = -ENOMEM;
goto out_unlock;
}
set_page_extent_mapped(page);
- clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
- 0, NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -134,9 +134,9 @@ out_unlock:
put_page(page);
}
if (ret)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- file_offset, block_size, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
out:
extent_changeset_free(data_reserved);
@@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(dst, new_key->offset,
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
inline_data, size, datal,
comp_type);
goto out;
@@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -231,8 +231,8 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -439,7 +439,7 @@ process_slot:
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
+ struct btrfs_replace_extent_info clone_info;
/*
* a | --- range to clone ---| b
@@ -462,8 +462,8 @@ process_slot:
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path, drop_start,
+ clone_info.is_new_extent = false;
+ ret = btrfs_replace_file_extents(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
if (ret)
@@ -520,6 +520,8 @@ process_slot:
ret = -EINTR;
goto out;
}
+
+ cond_resched();
}
ret = 0;
@@ -533,7 +535,7 @@ process_slot:
btrfs_release_path(path);
path->leave_spinning = 0;
- ret = btrfs_punch_hole_range(inode, path, last_dest_end,
+ ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
if (ret)
goto out;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4ba1ab9cc76d..3602806d71bd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1206,7 +1206,8 @@ again:
}
if (cow) {
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1274,7 +1275,8 @@ again:
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
+ slot, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
if (ret < 0)
@@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
- slot, &eb);
+ slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret < 0) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index c89697486366..702dc5441f03 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 354ab9985a34..cf63f1e27a27 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
@@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d9813a5b075a..340c76a12ce1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -122,8 +122,6 @@ struct send_ctx {
struct file_ra_state ra;
- char *read_buf;
-
/*
* We process inodes by their increasing order, so if before an
* incremental send we reverse the parent/child relationship of
@@ -278,11 +276,6 @@ enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
@@ -584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
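
The switch from plain cpu_to_le16() stores to put_unaligned_le16() matters because the headers are written at arbitrary byte offsets inside send_buf, which need not be naturally aligned for a 16-bit store on strict-alignment architectures. Conceptually the helper is just a byte-wise little-endian store, roughly:

	/* Rough conceptual equivalent of put_unaligned_le16(). */
	static inline void put_le16_bytewise(u16 val, void *p)
	{
		u8 *b = p;

		b[0] = val & 0xff;	/* least significant byte first */
		b[1] = val >> 8;
	}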
@@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx)
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
return ret;
@@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
}
/*
+ * When processing the new references for an inode we may orphanize an existing
+ * directory inode because its old name conflicts with one of the new references
+ * of the current inode. Later, when processing another new reference of our
+ * inode, we might need to orphanize another inode, but the path we have in the
+ * reference reflects the pre-orphanization name of the directory we previously
+ * orphanized. For example:
+ *
+ * parent snapshot looks like:
+ *
+ * .                   (ino 256)
+ * |----- f1           (ino 257)
+ * |----- f2           (ino 258)
+ * |----- d1/          (ino 259)
+ * |----- d2/          (ino 260)
+ *
+ * send snapshot looks like:
+ *
+ * .                   (ino 256)
+ * |----- d1           (ino 258)
+ * |----- f2/          (ino 259)
+ * |----- f2_link/     (ino 260)
+ * |       |----- f1   (ino 257)
+ * |
+ * |----- d2           (ino 258)
+ *
+ * When processing inode 257 we compute the name for inode 259 as "d1", and we
+ * cache it in the name cache. Later when we start processing inode 258, when
+ * collecting all its new references we set a full path of "d1/d2" for its new
+ * reference with name "d2". When we start processing the new references we
+ * start by processing the new reference with name "d1", and this results in
+ * orphanizing inode 259, since its old reference causes a conflict. Then we
+ * move on to the next new reference, with name "d2", and we find out we must
+ * orphanize inode 260, as its old reference conflicts with ours - but for the
+ * orphanization we use a source path corresponding to the path we stored in the
+ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
+ * receiver fail since the path component "d1/" no longer exists; it was renamed
+ * to "o259-6-0/" when processing the previous new reference. So in this case we
+ * must recompute the path in the new reference and use it for the new
+ * orphanization operation.
+ */
+static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ char *name;
+ int ret;
+
+ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ fs_path_reset(ref->full_path);
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add(ref->full_path, name, ref->name_len);
+ if (ret < 0)
+ goto out;
+
+ /* Update the reference's base name pointer. */
+ set_ref_path(ref, ref->full_path);
+out:
+ kfree(name);
+ return ret;
+}
+
+/*
* This does all the move/link/unlink/rmdir magic.
*/
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
@@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the paths used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
list_for_each_entry(cur, &sctx->new_refs, list) {
- /*
- * We may have refs where the parent directory does not exist
- * yet. This happens if the parent directories inum is higher
- * than the current inum. To handle this case, we create the
- * parent directory out of order. But we need to check if this
- * did already happen before due to other refs in the same dir.
- */
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- if (ret == inode_state_will_create) {
- ret = 0;
- /*
- * First check if any of the current inodes refs did
- * already create the dir.
- */
- list_for_each_entry(cur2, &sctx->new_refs, list) {
- if (cur == cur2)
- break;
- if (cur2->dir == cur->dir) {
- ret = 1;
- break;
- }
- }
-
- /*
- * If that did not happen, check if a previous inode
- * did already create the dir.
- */
- if (!ret)
- ret = did_create_dir(sctx, cur->dir);
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = send_create_inode(sctx, cur->dir);
- if (ret < 0)
- goto out;
- }
- }
+ if (ret == inode_state_will_create)
+ continue;
/*
- * Check if this new ref would overwrite the first ref of
- * another unprocessed inode. If yes, orphanize the
- * overwritten inode. If we find an overwritten ref that is
- * not the first ref, simply unlink it.
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
@@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct name_cache_entry *nce;
struct waiting_dir_move *wdm;
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
@@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
}
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directory's inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inode's refs already
+ * created the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode already
+ * created the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
if (ret < 0)
@@ -4799,7 +4911,25 @@ out:
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
- ssize_t ret = 0;
+ int ret;
+
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
@@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
}
addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
+ cur_len);
kunmap(page);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
iput(inode);
return ret;
}
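
The TLV framing emitted by put_data_header() above is the v1 send stream
attribute encoding: a little-endian u16 type, a little-endian u16 length,
then the payload bytes. A minimal stand-alone sketch of that encoding,
assuming the btrfs_tlv_header layout from send.h (the helper name below is
hypothetical, not from the patch):

    struct tlv_header {                  /* mirrors struct btrfs_tlv_header */
            __le16 tlv_type;
            __le16 tlv_len;
    } __packed;

    /* Append one attribute to a send-style buffer, with the same
     * overflow check as put_data_header(). */
    static int put_attr(u8 *buf, u32 *pos, u32 max, u16 type,
                        const void *data, u16 len)
    {
            struct tlv_header *hdr;

            if (max - *pos < sizeof(*hdr) + len)
                    return -EOVERFLOW;
            hdr = (struct tlv_header *)(buf + *pos);
            put_unaligned_le16(type, &hdr->tlv_type);
            put_unaligned_le16(len, &hdr->tlv_len);
            memcpy(buf + *pos + sizeof(*hdr), data, len);
            *pos += sizeof(*hdr) + len;
            return 0;
    }

put_file_data() follows the same pattern, except the payload is copied
straight from the page cache into send_buf after the header.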
@@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
@@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
-
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
goto out;
@@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -5033,8 +5150,8 @@ out:
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
return 0;
}
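
With the v1 constants from send.h (BTRFS_SEND_BUF_SIZE == SZ_64K, so
max_send_read_size() leaves 48K of headroom for file data), the loop above
slices an extent into fixed-size WRITE commands. A worked sketch:

    u64 read_size = SZ_64K - SZ_16K;     /* 49152 bytes */
    u64 len = 100 * SZ_1K;               /* a 100KiB extent */
    u64 sent = 0;

    while (sent < len) {
            u64 size = min(len - sent, read_size);

            /* iterations send 48K, 48K, then 4K: three WRITE commands */
            sent += size;
    }

Advancing by size unconditionally is safe now because put_file_data()
either copies exactly the requested length into the send buffer or fails;
the removed short-read handling existed only because fill_read_buf() could
return less than it was asked for.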
@@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
-
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
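
btrfs_file_extent_end() is defined outside this hunk; a sketch of what it
is presumably expected to compute, folding in the inline/regular branches
removed above (rounding inline extents up to the filesystem block size is
an assumption, based on the PAGE_ALIGN logic it replaces):

    static u64 file_extent_end_sketch(struct extent_buffer *leaf,
                                      const struct btrfs_key *key,
                                      struct btrfs_file_extent_item *fi,
                                      u32 sectorsize)
    {
            if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE)
                    /* inline data ends at ram_bytes, rounded up to a block */
                    return ALIGN(key->offset +
                                 btrfs_file_extent_ram_bytes(leaf, fi),
                                 sectorsize);
            return key->offset + btrfs_file_extent_num_bytes(leaf, fi);
    }

The clamp to cur_inode_size then happens once at the call site via min_t(),
replacing the removed i_size checks.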
@@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
+ struct btrfs_root *right_root, void *ctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx)
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -7378,7 +7466,6 @@ out:
kvfree(sctx->clone_roots);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
name_cache_free(sctx);
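
Net effect on send's buffering, summarized as a sketch from the hunks
above:

    /*
     * Before: kvmalloc(SZ_64K) send_buf + kvmalloc(48K) read_buf;
     *         page cache -> read_buf -> send_buf (two copies per WRITE).
     * After:  kvmalloc(SZ_64K) send_buf only;
     *         page cache -> send_buf (one copy per WRITE), with the TLV
     *         header written first by put_data_header().
     */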
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index ead397f7034f..de91488b7cd0 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -13,7 +13,6 @@
#define BTRFS_SEND_STREAM_VERSION 1
#define BTRFS_SEND_BUF_SIZE SZ_64K
-#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 475968ccbd1d..64099565ab8f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry(found, head, list)
found->full = 0;
- rcu_read_unlock();
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
@@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (ret)
return ret;
- list_add_rcu(&space_info->list, &info->space_info);
+ list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
return found;
- }
}
- rcu_read_unlock();
return NULL;
}
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
@@ -476,28 +465,6 @@ again:
up_read(&info->groups_sem);
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered)
{
- struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 dio_bytes;
- u64 async_pages;
u64 items;
long time_left;
- unsigned long nr_pages;
int loops;
/* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but flushing that much delalloc doesn't reclaim
+ * metadata one-to-one, so increase the amount to reclaim by 2x
+ * in order to make sure we're flushing enough delalloc to
+ * hopefully reclaim some metadata reservations.
+ */
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ }
trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
-
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+ btrfs_start_delalloc_roots(fs_info, items);
- /*
- * We need to wait for the compressed pages to start before
- * we continue.
- */
- async_pages = atomic_read(&fs_info->async_delalloc_pages);
- if (!async_pages)
- goto skip_async;
-
- /*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
- */
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
- async_pages -= nr_pages;
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
- wait_event(fs_info->async_submit_wait,
- atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
-skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -596,14 +551,6 @@ skip_async:
}
spin_unlock(&space_info->lock);
- loops++;
- if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
- } else {
- time_left = schedule_timeout_killable(1);
- if (time_left)
- break;
- }
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
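
A worked example of the 2x heuristic above, assuming calc_reclaim_items_nr()
divides by the per-item metadata insertion cost and that
EXTENT_SIZE_PER_ITEM is SZ_256K (both defined earlier in space-info.c,
outside this hunk):

    u64 to_reclaim = SZ_1M;                         /* metadata needed back */
    u64 items;

    items = div64_u64(to_reclaim, SZ_256K) * 2;     /* 4 items, doubled to 8 */
    to_reclaim = items * SZ_256K;                   /* flush 2MiB of delalloc */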
@@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes_needed;
u64 reclaim_bytes = 0;
+ u64 bytes_needed = 0;
u64 cur_free_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
+ if (ticket)
+ bytes_needed = ticket->bytes;
if (bytes_needed > cur_free_bytes)
bytes_needed -= cur_free_bytes;
@@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
goto commit;
/*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
+ * See if there is some space in the delayed insertion reserve for this
+ * reservation. If the space_infos don't match (like for DATA or
+ * SYSTEM) then just go enospc, reclaiming this space won't recover any
+ * space to satisfy those reservations.
*/
if (space_info != delayed_rsv->space_info)
goto enospc;
@@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+ shrink_delalloc(fs_info, space_info, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
ret = btrfs_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
@@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
} while (flush_state <= COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * FLUSH_DELAYED_REFS
+ * The above two cases generate delayed refs that will affect
+ * ->total_bytes_pinned. However this counter can be inconsistent with
+ * reality if there are outstanding delayed refs. This is because we adjust
+ * the counter based solely on the current set of delayed refs and disregard
+ * any on-disk state which might include more refs. So for example, if we
+ * have an extent with 2 references, but we only drop 1, we'll see that there
+ * is a negative delayed ref count for the extent and assume that the space
+ * will be freed, and thus increase ->total_bytes_pinned.
+ *
+ * Running the delayed refs gives us the actual real view of what will be
+ * freed at the transaction commit time. This stage will not actually free
+ * space for us, it just makes sure that may_commit_transaction() has all of
+ * the information it needs to make the right decision.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by the previous
+ * two stages. We will not commit the transaction if we don't think we're
+ * likely to satisfy our request, which means if our current free space +
+ * total_bytes_pinned < reservation we will not commit. This is why the
+ * previous states are actually important, to make sure we know for sure
+ * whether committing the transaction will allow us to make progress.
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate, do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_WAIT,
+ RUN_DELAYED_IPUTS,
+ FLUSH_DELAYED_REFS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ int flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state]);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
} while (flush_state < states_nr);
}
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
@@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
int ret;
switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
@@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
default:
ASSERT(0);
break;
@@ -1214,11 +1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct work_struct *async_work;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
@@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
- flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
@@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
return ret;
}
+
+/**
+ * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
+ * @fs_info - the filesystem
+ * @bytes - the number of bytes we need
+ * @flush - how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
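
A hypothetical caller, sketching how the delalloc path can funnel data
reservations through the new helper (the real call sites live in
delalloc-space.c, outside this hunk; the function below is illustrative,
not from the patch):

    static int reserve_data_sketch(struct btrfs_inode *inode, u64 bytes)
    {
            struct btrfs_fs_info *fs_info = inode->root->fs_info;
            enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;

            /* the free space inode must not block on full flushing */
            if (btrfs_is_free_space_inode(inode))
                    flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;

            bytes = round_up(bytes, fs_info->sectorsize);
            return btrfs_reserve_data_bytes(fs_info, bytes, flush);
    }

This mirrors the ASSERTs in btrfs_reserve_data_bytes(): only the two data
flush modes are accepted, and a task holding a transaction handle must not
use BTRFS_RESERVE_FLUSH_DATA.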
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index c3c64019950a..5646393b928c 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 079b059818e9..c46be27be700 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -7,16 +7,6 @@
#include "ctree.h"
-static inline u8 get_unaligned_le8(const void *p)
-{
- return *(u8 *)p;
-}
-
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
-}
-
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
{
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 25967ecaaf0a..8840a4fa81eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1871,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
btrfs_discard_cleanup(fs_info);
@@ -2163,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 thresh = 0;
int mixed = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(found, &fs_info->space_info, list) {
+ list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2193,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
total_used += found->disk_used;
}
- rcu_read_unlock();
-
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c8df2edafd85..279d9262b676 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -14,6 +14,7 @@
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
+#include "send.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
@@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
NULL
};
@@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
BTRFS_ATTR(, checksum, btrfs_checksum_show);
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return scnprintf(buf, PAGE_SIZE, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
NULL,
};
@@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
- btrfs_reset_fs_info_ptr(fs_info);
-
sysfs_remove_link(fsid_kobj, "bdi");
if (fs_info->space_info_kobj) {
@@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(fsid_kobj, btrfs_attrs);
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_INCOMPAT] = "incompat",
};
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
{
return btrfs_feature_set_names[set];
}
@@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
rkobj->flags = cache->flags;
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
btrfs_bg_type_to_raid_name(rkobj->flags));
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
kobject_put(&rkobj->kobj);
btrfs_warn(fs_info,
"failed to add kobject for block cache, ignoring");
return;
}
-
- space_info->block_group_kobjs[index] = &rkobj->kobj;
}
/*
@@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
return 0;
}
-/* when one_device is NULL, it removes all device links */
-
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
+ struct kobject *devices_kobj;
- if (!fs_devices->devices_kobj)
- return -EINVAL;
-
- if (one_device) {
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
-
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ /*
+ * A seed fs_devices' devices_kobj isn't used; fetch the kobject
+ * from fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
- return 0;
+ if (device->bdev) {
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+ sysfs_remove_link(devices_kobj, disk_kobj->name);
}
- list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
-
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
}
-
- return 0;
}
static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
@@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_device *device)
{
- int error = 0;
- struct btrfs_device *dev;
+ int ret;
unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
- nofs_flag = memalloc_nofs_save();
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ /*
+ * Make sure we use fs_info::fs_devices to fetch the kobjects,
+ * even for seed fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
- if (one_device && one_device != dev)
- continue;
+ nofs_flag = memalloc_nofs_save();
- if (dev->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
+ if (device->bdev) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
- disk = dev->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_devices->devices_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
- break;
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
}
+ }
- init_completion(&dev->kobj_unregister);
- error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devinfo_kobj, "%llu",
- dev->devid);
- if (error) {
- kobject_put(&dev->devid_kobj);
- break;
- }
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
}
+
+out:
memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
- return error;
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
}
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
@@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
{
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
@@ -1333,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
* Sprouting changes fsid of the mounted filesystem, rename the fsid
* directory
*/
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_devices->fs_info,
"sysfs: failed to create fsid for sprout");
@@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- btrfs_set_fs_info_ptr(fs_info);
-
- error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_remove_devices_dir(fs_devs, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_devs);
return error;
}
@@ -1626,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 features;
- int ret;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
if (!fs_info)
return;
+ /*
+ * See commits 14e46e04958df74 and e410e34fad913dd: feature bit
+ * updates are not safe when called from some contexts (e.g. balance)
+ */
features = get_features(fs_info, set);
ASSERT(bit & supported_feature_masks[set]);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index cf839c46a131..bacef43f7267 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -13,15 +13,12 @@ enum btrfs_feature_set {
};
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index a1b9f9b5978e..df54cdfdc250 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, &key, &value_len, 1);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 894a63a92236..e6719f7db386 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
}
/*
@@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
BTRFS_I(inode)->root = root;
- btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d2fc292ac61b..52ada47aff50 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -292,6 +292,8 @@ loop:
}
cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
@@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
- 0, &eb);
+ 0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
@@ -2165,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_wait_delalloc_flush(trans);
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise, if this transaction commits before the ordered
+ * extents complete, we lose logged data after a power failure.
+ */
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
+
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
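
The wake-up side of pending_wait is not in this hunk; it presumably sits in
the ordered extent completion path (ordered-data.c is part of this series).
A sketch of the expected counterpart, with transaction reference counting
and locking elided:

    /* assumed: runs when a pending ordered extent completes */
    if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &ordered->flags)) {
            struct btrfs_transaction *trans = fs_info->running_transaction;

            if (trans && atomic_dec_and_test(&trans->pending_ordered))
                    wake_up(&trans->pending_wait);
    }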
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d60b055b8695..858d9153a1cd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -85,6 +85,13 @@ struct btrfs_transaction {
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -105,6 +112,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
+#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7b1fee630f97..f0ffd5ee77bd 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_root_item ri;
+ struct btrfs_root_item ri = { 0 };
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
int ret;
@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
if (ret < 0)
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
generic_err(leaf, slot,
- "invalid root item size, have %u expect %zu",
- btrfs_item_size_nr(leaf, slot), sizeof(ri));
+ "invalid root item size, have %u expect %zu or %u",
+ btrfs_item_size_nr(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
}
+ /*
+ * For a legacy root item, the members starting at generation_v2 will
+ * all be filled with 0.
+ * And since we allow generation_v2 to be 0, it will still pass the check.
+ */
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- sizeof(ri));
+ btrfs_item_size_nr(leaf, slot));
/* Generation related */
if (btrfs_root_generation(&ri) >
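
btrfs_legacy_root_item_size() is defined elsewhere (ctree.h); the check
above relies on it returning the size of the pre-generation_v2 root item.
A sketch of the assumed definition:

    /* the legacy root item ended right before generation_v2 */
    static inline u32 btrfs_legacy_root_item_size(void)
    {
            return offsetof(struct btrfs_root_item, generation_v2);
    }

Reading only btrfs_item_size_nr() bytes into a zeroed btrfs_root_item is
what makes the "ri = { 0 }" initialization above load-bearing: the v2
members of a legacy item stay zero instead of holding stack garbage.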
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39da9db35278..56cbc1706b6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,6 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root)
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -3615,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* search and this search we'll not find the key again and can just
* bail.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret != 0)
goto done;
@@ -3634,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
if (min_key.objectid != ino || min_key.type != key_type)
goto done;
+
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
+
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
if (ret) {
@@ -4082,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4094,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
em->block_start == EXTENT_MAP_HOLE)
return 0;
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
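
Complementing the diagrammed case above, a worked sketch of the
front-overlap trim: an ordered extent covering the start of the logged
range shrinks it from the left, so only the tail is looked up in the csum
tree:

    /* em covers [0, 256K), one ordered extent covers [0, 64K) */
    u64 mod_start = 0, mod_len = SZ_256K;
    u64 ordered_start = 0, ordered_end = SZ_64K;

    if (ordered_start <= mod_start && ordered_end < mod_start + mod_len) {
            mod_len = (mod_start + mod_len) - ordered_end;  /* 192K left */
            mod_start = ordered_end;                        /* now at 64K */
    }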
@@ -4140,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
int ret;
int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
@@ -4342,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -4359,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4434,8 +4483,32 @@ process:
btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extent completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4841,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
+ ctx);
btrfs_add_delayed_iput(inode);
}
}
@@ -4883,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -4899,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5112,8 +5185,6 @@ next_key:
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
@@ -5292,7 +5363,7 @@ log_extents:
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5301,31 +5372,8 @@ log_extents:
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
@@ -5339,19 +5387,34 @@ log_extents:
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5591,7 +5654,7 @@ process_leaf:
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
+ log_mode, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
@@ -5735,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ LOG_INODE_ALL, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
ret = 1;
@@ -5786,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation > last_committed)
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5842,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation > fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -5950,8 +6012,6 @@ out:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -6004,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6100,15 +6160,13 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6371,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6405,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ return;
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fall back to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
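With the return-code plumbing gone, rename and link call sites no longer
branch on BTRFS_NEED_LOG_SYNC / BTRFS_NEED_TRANS_COMMIT. A minimal caller
sketch (illustrative only; the actual call sites live in fs/btrfs/inode.c
and are not part of this section):

	if (log_pinned) {
		btrfs_log_new_name(trans, BTRFS_I(old_inode),
				   BTRFS_I(old_dir), old_dentry->d_parent);
		log_pinned = false;
	}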
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..731bd9c029f5 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -16,8 +16,11 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
struct inode *inode;
struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
}
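The new ordered_extents list pairs this release helper with a producer on
the fast fsync path, which takes a reference on each ordered extent it
logs. A sketch of the producer side, assuming the list and refcount helpers
used elsewhere in this patch:

	/* while logging extents for a fast fsync: */
	refcount_inc(&ordered->refs);
	list_add_tail(&ordered->log_list, &ctx->ordered_extents);

	/* after the log sync (or fallback), with the inode still locked: */
	btrfs_release_log_ctx_extents(&ctx);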
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
@@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 117b43367629..58b9c419a2b6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -291,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* balance_mutex
*
*
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
@@ -318,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
* completed.
*/
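The btrfs_exclop_start()/btrfs_exclop_finish() helpers used below are
defined outside this section; a minimal sketch of the protocol, assuming
fs_info::exclusive_operation is a plain enum field:

	bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
				enum btrfs_exclusive_operation type)
	{
		/* Claim exclusivity atomically; fail if another op runs. */
		return !cmpxchg(&fs_info->exclusive_operation,
				BTRFS_EXCLOP_NONE, type);
	}

	void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
	{
		WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
	}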
@@ -356,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -406,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;
@@ -433,7 +434,8 @@ static struct btrfs_device *__alloc_device(void)
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
return dev;
}
@@ -593,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path,
btrfs_free_device(device);
ret = 0;
- if (fs_devices->num_devices == 0)
- break;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -941,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_warn_in_rcu(device->fs_info,
- "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+ path, devid, found_transid,
+ current->comm,
+ task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
- "device fsid %pU devid %llu moved old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "devid %llu device path %s changed to %s scanned by %s (%d)",
+ devid, rcu_str_deref(device->name),
+ path, current->comm,
+ task_pid_nr(current));
}
name = rcu_string_strdup(path, GFP_NOFS);
@@ -1035,28 +1037,21 @@ error:
return ERR_PTR(ret);
}
-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ int step, struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
- struct btrfs_device *latest_dev = NULL;
- mutex_lock(&uuid_mutex);
-again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state) &&
+ &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
- latest_dev = device;
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
}
continue;
}
@@ -1094,10 +1089,22 @@ again:
btrfs_free_device(device);
}
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
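This hunk shows the conversion pattern repeated throughout the file: the
hand-rolled chain walked through fs_devices->seed becomes a standard
list_head anchored at the sprouted filesystem. In outline:

	/* before: */
	while (fs_devices) {
		/* ... */
		fs_devices = fs_devices->seed;
	}

	/* after: */
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		/* ... */
	}

Note that the old loop visited the main fs_devices on its first iteration;
with the list it is the anchor, not a member, so it is handled once before
the loop, as the direct call to __btrfs_free_extra_devids() above does.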
@@ -1149,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device)
ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
+ lockdep_assert_held(&uuid_mutex);
+
if (--fs_devices->opened > 0)
- return 0;
+ return;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = false;
-
- return 0;
+ fs_devices->fs_info = NULL;
}
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_devices *seed_devices = NULL;
- int ret;
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
- ret = close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
- seed_devices = fs_devices->seed;
- fs_devices->seed = NULL;
- }
- mutex_unlock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened)
+ list_splice_init(&fs_devices->seed_list, &list);
- while (seed_devices) {
- fs_devices = seed_devices;
- seed_devices = fs_devices->seed;
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
- return ret;
+ mutex_unlock(&uuid_mutex);
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1197,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
+ struct btrfs_device *tmp_device;
flags |= FMODE_EXCL;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- /* Just open everything we can; ignore failures here */
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
- continue;
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
- if (!latest_dev ||
- device->generation > latest_dev->generation)
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
}
if (fs_devices->open_devices == 0)
return -EINVAL;
@@ -1961,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device(
* this_dev) which is active.
*/
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
- struct btrfs_device *this_dev)
+ struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_device *next_device;
- if (this_dev)
- next_device = this_dev;
- else
+ if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
- device);
+ device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
@@ -1999,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
-static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
{
struct btrfs_super_block *disk_super;
int copy_num;
@@ -2040,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+ u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2144,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2165,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
- while (fs_devices) {
- if (fs_devices->seed == cur_devices) {
- fs_devices->seed = cur_devices->seed;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- cur_devices->seed = NULL;
+ list_del_init(&cur_devices->seed_list);
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
@@ -2221,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
- struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(fs_info, srcdev->bdev,
- srcdev->name->str);
- }
+ mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
@@ -2236,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
/* if there are no devs left we'd rather delete the fs_devices */
if (!fs_devices->num_devices) {
- struct btrfs_fs_devices *tmp_fs_devices;
-
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
@@ -2246,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
*/
ASSERT(fs_devices->seeding);
- tmp_fs_devices = fs_info->fs_devices;
- while (tmp_fs_devices) {
- if (tmp_fs_devices->seed == fs_devices) {
- tmp_fs_devices->seed = fs_devices->seed;
- break;
- }
- tmp_fs_devices = tmp_fs_devices->seed;
- }
- fs_devices->seed = NULL;
+ list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
@@ -2266,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
+ btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2375,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
if (!fs_devices->seeding)
return -EINVAL;
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple seed filesystems.
+ */
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
@@ -2399,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- mutex_lock(&fs_info->chunk_mutex);
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- mutex_unlock(&fs_info->chunk_mutex);
-
fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = false;
- fs_devices->seed = seed_devices;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2511,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
- bool unlocked = false;
+ bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
@@ -2525,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
+ locked = true;
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
- mutex_unlock(
- &fs_devices->device_list_mutex);
+ rcu_read_unlock();
goto error;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
@@ -2613,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
- /* add sysfs device entry */
- btrfs_sysfs_add_devices_dir(fs_devices, device);
-
/*
* we've got more storage, clear any full flags on the space
* infos
@@ -2623,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
+
+ /* Add sysfs device entry */
+ btrfs_sysfs_add_device(device);
+
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
@@ -2648,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_sysfs;
}
- btrfs_sysfs_update_sprout_fsid(fs_devices,
- fs_info->fs_devices->fsid);
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_prepare_sprout
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
@@ -2657,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
- unlocked = true;
+ locked = false;
if (ret) /* transaction commit */
return ret;
@@ -2692,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
error_sysfs:
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -2718,7 +2711,7 @@ error_free_device:
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev && !unlocked) {
+ if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -4045,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
/*
* rw_devices will not change at the moment, device add/delete/replace
- * are excluded by EXCL_OP
+ * are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
@@ -4181,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
@@ -4192,7 +4185,7 @@ out:
reset_balance_state(fs_info);
else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -4294,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4376,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
@@ -6461,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
bool seed)
{
struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
+
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
+ }
- while (fs_devices) {
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
if (!fsid ||
- !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &fs_devices->devices,
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &seed_devs->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
@@ -6473,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
return device;
}
}
- if (seed)
- fs_devices = fs_devices->seed;
- else
- return NULL;
}
+
return NULL;
}
@@ -6532,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device();
+ dev = __alloc_device(fs_info);
if (IS_ERR(dev))
return dev;
@@ -6728,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
- fs_devices = fs_info->fs_devices->seed;
- while (fs_devices) {
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
- fs_devices = fs_devices->seed;
- }
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
@@ -6750,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
return fs_devices;
}
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6757,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(-EINVAL);
- goto out;
+ return ERR_PTR(-EINVAL);
}
- fs_devices->seed = fs_info->fs_devices->seed;
- fs_info->fs_devices->seed = fs_devices;
-out:
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
return fs_devices;
}
@@ -7189,17 +7188,22 @@ error:
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- while (fs_devices) {
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
+ fs_devices->fs_info = fs_info;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
device->fs_info = fs_info;
- mutex_unlock(&fs_devices->device_list_mutex);
- fs_devices = fs_devices->seed;
+ seed_devs->fs_info = fs_info;
}
+ mutex_unlock(&fs_devices->device_list_mutex);
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7225,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
-int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
{
- struct btrfs_key key;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_dev_stats_item *ptr;
struct extent_buffer *eb;
- int slot;
- int ret = 0;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
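The (1 + i) * sizeof(__le64) guard keeps compatibility with stats items
written by older kernels that persisted fewer counters: indexes beyond the
item size read as zero. A standalone illustration of the same guard
(userspace C, hypothetical names):

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	#define DEV_STAT_VALUES_MAX 5	/* stand-in for BTRFS_DEV_STAT_VALUES_MAX */

	static void load_stats(const uint64_t *item, size_t item_size,
			       uint64_t out[DEV_STAT_VALUES_MAX])
	{
		for (int i = 0; i < DEV_STAT_VALUES_MAX; i++)
			out[i] = (item_size >= (1 + i) * sizeof(uint64_t)) ?
				 item[i] : 0;
	}

	int main(void)
	{
		uint64_t item[3] = { 7, 1, 2 };	/* "old" item, 3 counters */
		uint64_t out[DEV_STAT_VALUES_MAX];

		load_stats(item, sizeof(item), out);
		for (int i = 0; i < DEV_STAT_VALUES_MAX; i++)
			printf("stat[%d] = %llu\n", i,
			       (unsigned long long)out[i]);
		return 0;
	}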
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
- int i;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7243,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- int item_size;
- struct btrfs_dev_stats_item *ptr;
-
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
- key.offset = device->devid;
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
- if (ret) {
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_set(device, i, 0);
- device->dev_stats_valid = 1;
- btrfs_release_path(path);
- continue;
- }
- slot = path->slots[0];
- eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
-
- ptr = btrfs_item_ptr(eb, slot,
- struct btrfs_dev_stats_item);
-
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
- if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
- else
- btrfs_dev_stat_set(device, i, 0);
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
}
-
- device->dev_stats_valid = 1;
- btrfs_dev_stat_print_on_load(device);
- btrfs_release_path(path);
}
+out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
- return ret < 0 ? ret : 0;
+ return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
@@ -7496,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
mutex_unlock(&trans->fs_info->chunk_mutex);
}
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
- }
-}
-
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = NULL;
- fs_devices = fs_devices->seed;
- }
-}
-
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
@@ -7594,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
- NULL, false);
+ struct btrfs_fs_devices *devs;
+
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
+ struct btrfs_fs_devices, seed_list);
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5eea93916fbf..bf27ac07d315 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,7 +58,7 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string *name;
+ struct rcu_string __rcu *name;
u64 generation;
@@ -246,7 +246,7 @@ struct btrfs_fs_devices {
*/
struct list_head alloc_list;
- struct btrfs_fs_devices *seed;
+ struct list_head seed_list;
bool seeding;
int opened;
@@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
@@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
diff --git a/fs/buffer.c b/fs/buffer.c
index 50bbc99e3d96..23f645657488 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *old_memcg;
if (retry)
gfp |= __GFP_NOFAIL;
memcg = get_mem_cgroup_from_page(page);
- memalloc_use_memcg(memcg);
+ old_memcg = set_active_memcg(memcg);
head = NULL;
offset = PAGE_SIZE;
@@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
set_bh_page(bh, page, offset);
}
out:
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
return head;
/*
@@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
-#if 0
- /* Not really sure about this - do we need this ? */
- if (page->mapping->a_ops->invalidatepage)
- page->mapping->a_ops->invalidatepage(page, offset);
-#endif
unlock_page(page);
return 0; /* don't care */
}
@@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
- do_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
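Unlike the memalloc_use_memcg()/memalloc_unuse_memcg() pair it replaces,
set_active_memcg() returns the previously active memcg, which the caller
must save and restore; that is what makes nested scopes safe. The pattern,
in brief:

	old_memcg = set_active_memcg(memcg);
	/* allocations here are charged to memcg */
	set_active_memcg(old_memcg);	/* restore any outer scope */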
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6ea761c84494..35c83f65475b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g)
return !PagePrivate(page);
}
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- */
-static int ceph_sync_readpages(struct ceph_fs_client *fsc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages,
- int page_align)
-{
- struct ceph_osd_client *osdc = &fsc->client->osdc;
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, truncate_seq, truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short read due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0,
- pages, *plen, page_align, false, false);
-
- dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
- off, *plen, *plen, page_align);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
-
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-
-/*
- * read a single page, without unlocking it.
- */
+/* read a single page, without unlocking it. */
static int ceph_do_readpage(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ struct ceph_vino vino = ceph_vino(inode);
int err = 0;
u64 off = page_offset(page);
u64 len = PAGE_SIZE;
@@ -260,19 +217,33 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
if (err == 0)
return -EINPROGRESS;
- dout("readpage inode %p file %p page %p index %lu\n",
- inode, filp, page, page->index);
- err = ceph_sync_readpages(fsc, ceph_vino(inode),
- &ci->i_layout, off, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1, 0);
+ dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
+ vino.ino, vino.snap, filp, off, len, page, page->index);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+
+ err = ceph_osdc_start_request(osdc, req, false);
+ if (!err)
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
+
+ ceph_osdc_put_request(req);
+ dout("readpage result %d\n", err);
+
if (err == -ENOENT)
err = 0;
if (err < 0) {
- SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
goto out;
}
if (err < PAGE_SIZE)
@@ -312,8 +283,8 @@ static void finish_read(struct ceph_osd_request *req)
int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
- if (rc == -EBLACKLISTED)
- ceph_inode_to_client(inode)->blacklisted = true;
+ if (rc == -EBLOCKLISTED)
+ ceph_inode_to_client(inode)->blocklisted = true;
/* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -620,50 +591,6 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
- * do a synchronous write on N pages
- */
-static int ceph_sync_writepages(struct ceph_fs_client *fsc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int num_pages)
-{
- struct ceph_osd_client *osdc = &fsc->client->osdc;
- struct ceph_osd_request *req;
- int rc = 0;
- int page_align = off & ~PAGE_MASK;
-
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq, truncate_size,
- true);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short write due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, false);
- dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
- req->r_mtime = *mtime;
- rc = ceph_osdc_start_request(osdc, req, true);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
-
- ceph_osdc_put_request(req);
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
-}
-
-/*
* Write a single page, but leave the page locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
@@ -671,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_fs_client *fsc;
+ struct inode *inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
- int err, len = PAGE_SIZE;
+ int err;
+ loff_t len = PAGE_SIZE;
struct ceph_writeback_ctl ceph_wbc;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
dout("writepage %p idx %lu\n", page, page->index);
- inode = page->mapping->host;
- ci = ceph_inode(inode);
- fsc = ceph_inode_to_client(inode);
-
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
@@ -713,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
- dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
+ dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
inode, page, page->index, page_off, len, snapc, snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
@@ -721,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
- err = ceph_sync_writepages(fsc, ceph_vino(inode),
- &ci->i_layout, snapc, page_off, len,
- ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size,
- &inode->i_mtime, &page, 1);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
+ ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
+ true);
+ if (IS_ERR(req)) {
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ return PTR_ERR(req);
+ }
+
+ /* it may be a short write due to an object boundary */
+ WARN_ON_ONCE(len > PAGE_SIZE);
+ osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+ dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
+
+ req->r_mtime = inode->i_mtime;
+ err = ceph_osdc_start_request(osdc, req, true);
+ if (!err)
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
+
+ ceph_osdc_put_request(req);
+ if (err == 0)
+ err = len;
+
if (err < 0) {
struct writeback_control tmp_wbc;
if (!wbc)
@@ -737,8 +685,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
return err;
}
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
mapping_set_error(&inode->i_data, err);
@@ -801,8 +749,8 @@ static void writepages_finish(struct ceph_osd_request *req)
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
- if (rc == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (rc == -EBLOCKLISTED)
+ fsc->blocklisted = true;
} else {
ceph_clear_error_write(ci);
}
@@ -962,9 +910,8 @@ retry:
max_pages = wsize >> PAGE_SHIFT;
get_more_pages:
- pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
- end, PAGECACHE_TAG_DIRTY,
- max_pages - locked_pages);
+ pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_DIRTY);
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
if (!pvec_pages && !locked_pages)
break;
@@ -1299,110 +1246,60 @@ static int context_is_writeable_or_written(struct inode *inode,
return ret;
}
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
+/**
+ * ceph_find_incompatible - find an incompatible context and return it
+ * @page: page being dirtied
+ *
+ * We are only allowed to write into/dirty a page if the page is
+ * clean, or already dirty within the same snap context. Returns a
+ * conflicting context if there is one, NULL if there isn't, or an
+ * ERR_PTR-encoded error on other failures.
*
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
+ * Must be called with page lock held.
*/
-static int ceph_update_writeable_page(struct file *file,
- loff_t pos, unsigned len,
- struct page *page)
+static struct ceph_snap_context *
+ceph_find_incompatible(struct page *page)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = page->mapping->host;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- loff_t page_off = pos & PAGE_MASK;
- int pos_in_page = pos & ~PAGE_MASK;
- int end_in_page = pos_in_page + len;
- loff_t i_size;
- int r;
- struct ceph_snap_context *snapc, *oldest;
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout(" page %p forced umount\n", page);
- unlock_page(page);
- return -EIO;
+ return ERR_PTR(-EIO);
}
-retry_locked:
- /* writepages currently holds page lock, but if we change that later, */
- wait_on_page_writeback(page);
+ for (;;) {
+ struct ceph_snap_context *snapc, *oldest;
+
+ wait_on_page_writeback(page);
+
+ snapc = page_snap_context(page);
+ if (!snapc || snapc == ci->i_head_snapc)
+ break;
- snapc = page_snap_context(page);
- if (snapc && snapc != ci->i_head_snapc) {
/*
* this page is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
+ /* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n",
- page, snapc);
- /*
- * queue for writeback, and wait for snapc to
- * be writeable or written
- */
- snapc = ceph_get_snap_context(snapc);
- unlock_page(page);
- ceph_queue_writeback(inode);
- r = wait_event_killable(ci->i_cap_wq,
- context_is_writeable_or_written(inode, snapc));
- ceph_put_snap_context(snapc);
- if (r == -ERESTARTSYS)
- return r;
- return -EAGAIN;
+ dout(" page %p snapc %p not current or oldest\n", page, snapc);
+ return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n",
- page, snapc);
- if (!clear_page_dirty_for_io(page))
- goto retry_locked;
- r = writepage_nounlock(page, NULL);
- if (r < 0)
- goto fail_unlock;
- goto retry_locked;
- }
-
- if (PageUptodate(page)) {
- dout(" page %p already uptodate\n", page);
- return 0;
- }
-
- /* full page? */
- if (pos_in_page == 0 && len == PAGE_SIZE)
- return 0;
-
- /* past end of file? */
- i_size = i_size_read(inode);
-
- if (page_off >= i_size ||
- (pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_SIZE)) {
- dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_SIZE);
- zero_user_segments(page,
- 0, pos_in_page,
- end_in_page, PAGE_SIZE);
- return 0;
- }
-
- /* we need to read it. */
- r = ceph_do_readpage(file, page);
- if (r < 0) {
- if (r == -EINPROGRESS)
- return -EAGAIN;
- goto fail_unlock;
+ dout(" page %p snapc %p not current, but oldest\n", page, snapc);
+ if (clear_page_dirty_for_io(page)) {
+ int r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ return ERR_PTR(r);
+ }
}
- goto retry_locked;
-fail_unlock:
- unlock_page(page);
- return r;
+ return NULL;
}
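Both converted call sites below (ceph_write_begin() and
ceph_page_mkwrite()) consume this helper with the same three-way pattern:

	snapc = ceph_find_incompatible(page);
	if (IS_ERR(snapc)) {
		r = PTR_ERR(snapc);	/* hard error */
	} else if (snapc) {
		/* conflict: drop the page lock, queue writeback, wait
		 * until the context is writeable or written, then retry */
		unlock_page(page);
		ceph_queue_writeback(inode);
		r = wait_event_killable(ci->i_cap_wq,
			context_is_writeable_or_written(inode, snapc));
		ceph_put_snap_context(snapc);
	} else {
		/* page is safe to dirty under the current context */
	}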
/*
@@ -1414,26 +1311,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct page *page;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+ struct page *page = NULL;
pgoff_t index = pos >> PAGE_SHIFT;
- int r;
+ int pos_in_page = pos & ~PAGE_MASK;
+ int r = 0;
- do {
- /* get a page */
- page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
+ dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
- dout("write_begin file %p inode %p page %p %d~%d\n", file,
- inode, page, (int)pos, (int)len);
+ for (;;) {
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page) {
+ r = -ENOMEM;
+ break;
+ }
- r = ceph_update_writeable_page(file, pos, len, page);
- if (r < 0)
+ snapc = ceph_find_incompatible(page);
+ if (snapc) {
+ if (IS_ERR(snapc)) {
+ r = PTR_ERR(snapc);
+ break;
+ }
+ unlock_page(page);
put_page(page);
- else
- *pagep = page;
- } while (r == -EAGAIN);
+ page = NULL;
+ ceph_queue_writeback(inode);
+ r = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ if (r != 0)
+ break;
+ continue;
+ }
+
+ if (PageUptodate(page)) {
+ dout(" page %p already uptodate\n", page);
+ break;
+ }
+
+ /*
+ * In some cases we don't need to read at all:
+ * - full page write
+ * - write that lies completely beyond EOF
+ * - write that covers the page from start to EOF or beyond it
+ */
+ if ((pos_in_page == 0 && len == PAGE_SIZE) ||
+ (pos >= i_size_read(inode)) ||
+ (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
+ zero_user_segments(page, 0, pos_in_page,
+ pos_in_page + len, PAGE_SIZE);
+ break;
+ }
+ /*
+ * We need to read it. If we get back -EINPROGRESS, then the page was
+ * handed off to fscache and it will be unlocked when the read completes.
+ * Refind the page in that case so we can reacquire the page lock. Otherwise
+ * we got a hard error or the read was completed synchronously.
+ */
+ r = ceph_do_readpage(file, page);
+ if (r != -EINPROGRESS)
+ break;
+ }
+
+ if (r < 0) {
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else {
+ *pagep = page;
+ }
return r;
}
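As a worked example of the zeroing shortcut above: a write at
pos_in_page = 100 with len = 200 that starts at or beyond EOF calls
zero_user_segments(page, 0, 100, 300, PAGE_SIZE), clearing bytes [0, 100)
and [300, PAGE_SIZE) while leaving [100, 300) for the upcoming copy, so no
OSD read is needed to make the rest of the page consistent.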
@@ -1522,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct page *pinned_page = NULL;
- loff_t off = vmf->pgoff << PAGE_SHIFT;
+ loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -1668,6 +1617,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
inode_inc_iversion_raw(inode);
do {
+ struct ceph_snap_context *snapc;
+
lock_page(page);
if (page_mkwrite_check_truncate(page, inode) < 0) {
@@ -1676,13 +1627,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
break;
}
- err = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (err >= 0) {
+ snapc = ceph_find_incompatible(page);
+ if (!snapc) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
ret = VM_FAULT_LOCKED;
+ break;
}
- } while (err == -EAGAIN);
+
+ unlock_page(page);
+
+ if (IS_ERR(snapc)) {
+ ret = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ ceph_queue_writeback(inode);
+ err = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ } while (err == 0);
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -2039,16 +2003,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
else if (err != -EPERM) {
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
goto out_unlock;
}
if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
- if (err2 == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err2 == -EBLOCKLISTED)
+ fsc->blocklisted = true;
err = err2;
goto out_unlock;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 034b3f4fdd3a..5027bbdca419 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1222,36 +1222,27 @@ struct cap_msg_args {
};
/*
- * Build and send a cap message to the given MDS.
- *
- * Caller should be holding s_mutex.
+ * cap struct size + flock buffer size + inline version + inline data size +
+ * osd_epoch_barrier + oldest_flush_tid
*/
-static int send_cap_msg(struct cap_msg_args *arg)
+#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
+
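For reference, the trailing fields sum to
4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 = 60 bytes, so CAP_MSG_SIZE is
sizeof(struct ceph_mds_caps) + 60, the same extra_len that the deleted
send_cap_msg() computed at run time. Making it a compile-time constant
lets callers preallocate the message and handle allocation failure up
front.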
+/* Marshal up the cap msg to the MDS */
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
- struct ceph_msg *msg;
void *p;
- size_t extra_len;
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
- arg->cid, arg->ino, ceph_cap_string(arg->caps),
- ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
- arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
- arg->mseq, arg->follows, arg->size, arg->max_size,
- arg->xattr_version,
+ dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
+ __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
- /* flock buffer size + inline version + inline data size +
- * osd_epoch_barrier + oldest_flush_tid */
- extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
- GFP_NOFS, false);
- if (!msg)
- return -ENOMEM;
-
msg->hdr.version = cpu_to_le16(10);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
/* Advisory flags (version 10) */
ceph_encode_32(&p, arg->flags);
-
- ceph_con_send(&arg->session->s_con, msg);
- return 0;
}
/*
@@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
*
* Caller should hold snap_rwsem (read), s_mutex.
*/
-static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
- struct ceph_inode_info *ci)
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
+ struct ceph_msg *msg;
struct inode *inode = &ci->vfs_inode;
- int ret;
- ret = send_cap_msg(arg);
- if (ret < 0) {
- pr_err("error sending cap msg, ino (%llx.%llx) "
- "flushing %s tid %llu, requeue\n",
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg) {
+ pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
ceph_vinop(inode), ceph_cap_string(arg->dirty),
arg->flush_tid);
spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(arg->session->s_mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
+ return;
}
+ encode_cap_msg(msg, arg);
+ ceph_con_send(&arg->session->s_con, msg);
ceph_buffer_put(arg->old_xattr_buf);
-
if (arg->wake)
wake_up_all(&ci->i_cap_wq);
}
@@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode,
u32 mseq, u64 oldest_flush_tid)
{
struct cap_msg_args arg;
+ struct ceph_msg *msg;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg)
+ return -ENOMEM;
arg.session = session;
arg.ino = ceph_vino(inode).ino;
@@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode,
arg.flags = 0;
arg.wake = false;
- return send_cap_msg(&arg);
+ encode_cap_msg(msg, &arg);
+ ceph_con_send(&arg.session->s_con, msg);
+ return 0;
}
/*
@@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
@@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
retry:
spin_lock(&ci->i_ceph_lock);
retry_locked:
+ /* Caps wanted by virtue of active open files. */
file_wanted = __ceph_caps_file_wanted(ci);
+
+ /* Caps which have active references against them */
used = __ceph_caps_used(ci);
+
+ /*
+ * "issued" represents the current caps that the MDS wants us to have.
+ * "implemented" is the set that we have been granted, and includes the
+ * ones that have not yet been returned to the MDS (the "revoking" set,
+ * usually because they have outstanding references).
+ */
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
want = file_wanted;
+
+ /* The ones we currently want to retain (may be adjusted below) */
retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
if (file_wanted) {
@@ -2011,6 +2017,10 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
+ /*
+ * If we have an auth cap, we don't need to consider any
+ * overlapping caps as used.
+ */
cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;
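To make the issued/implemented/revoking relationship described above concrete, a standalone sketch with made-up bitmask values (not the real CEPH_CAP_* flags):

	#include <assert.h>

	int main(void)
	{
		int issued      = 0x05;                  /* MDS wants us to hold these */
		int implemented = 0x07;                  /* what we actually hold      */
		int revoking    = implemented & ~issued; /* still to be returned       */

		assert(revoking == 0x02);
		return 0;
	}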
@@ -2148,7 +2158,7 @@ ack:
want, retain, flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -2222,7 +2232,7 @@ retry_locked:
flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool is_opened = false;
+ int i;
+
+ if (count == 1)
+ atomic64_inc(&mdsc->metric.opened_files);
+
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
ci->i_nr_by_mode[i] += count;
+
+ /*
+ * If any of the mode refs is larger than 1,
+ * that means the file has already been opened
+ * by others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i] > 1)
+ is_opened = true;
}
+
+ if (!is_opened)
+ percpu_counter_inc(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
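A standalone sketch of the (fmode << 1) | 1 mapping used here: bit 0 is the implicit PIN reference and each file mode occupies the bit above its mode value. The mode values below are assumptions mirroring the CEPH_FILE_MODE_* layout, not copies of the kernel definitions.

	#include <stdio.h>

	int main(void)
	{
		int fmode = 1 | 2;           /* assumed RD|WR                        */
		int bits = (fmode << 1) | 1; /* 0b0111: PIN plus the RD and WR slots */
		int i;

		for (i = 0; i < 4; i++)
			if (bits & (1 << i))
				printf("take a ref on i_nr_by_mode[%d]\n", i);
		return 0;
	}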
@@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool is_closed = true;
+ int i;
+
+ if (count == 1)
+ atomic64_dec(&mdsc->metric.opened_files);
+
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
BUG_ON(ci->i_nr_by_mode[i] < count);
ci->i_nr_by_mode[i] -= count;
}
+
+ /*
+ * If any of the mode refs is still non-zero after
+ * the decrement, the file is still opened by
+ * others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i])
+ is_closed = false;
}
+
+ if (is_closed)
+ percpu_counter_dec(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3e3fcda9b276..7a8fbe3e4751 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p)
int nr_caps = 0;
s64 total, sum, avg, min, max, sq;
+ sum = percpu_counter_sum(&m->total_inodes);
+ seq_printf(s, "item total\n");
+ seq_printf(s, "------------------------------------------\n");
+ seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes",
+ atomic64_read(&m->opened_files), sum);
+ seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes",
+ atomic64_read(&m->total_caps), sum);
+ seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes",
+ percpu_counter_sum(&m->opened_inodes), sum);
+
+ seq_printf(s, "\n");
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
seq_printf(s, "-----------------------------------------------------------------------------------\n");
@@ -202,7 +213,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
- seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
+ seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
+ cap->session->s_mds,
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
@@ -222,8 +234,8 @@ static int caps_show(struct seq_file *s, void *p)
"reserved\t%d\n"
"min\t\t%d\n\n",
total, avail, used, reserved, min);
- seq_printf(s, "ino issued implemented\n");
- seq_printf(s, "-----------------------------------------------\n");
+ seq_printf(s, "ino mds issued implemented\n");
+ seq_printf(s, "--------------------------------------------------\n");
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d72e4a12bb69..a4d48370b2b3 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry);
static int ceph_d_init(struct dentry *dentry)
{
struct ceph_dentry_info *di;
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
@@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int op;
int mask;
@@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
const char *dest)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -942,8 +939,7 @@ out:
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err = -EROFS;
@@ -1010,8 +1006,7 @@ out:
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int err;
@@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
struct ceph_mds_request *req;
int op = CEPH_MDS_OP_RENAME;
int err;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3f4c993dfc6f..209535d5b8d3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct ceph_mds_request *req;
int want_auth = USE_ANY_MDS;
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -256,8 +255,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
S_ISDIR(inode->i_mode));
- if (ret)
- return ret;
break;
case S_IFLNK:
@@ -285,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
*/
int ceph_renew_caps(struct inode *inode, int fmode)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
@@ -865,6 +862,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
size_t page_off;
u64 i_size;
bool more;
+ int idx;
+ size_t left;
req = ceph_osdc_new_request(osdc, &ci->i_layout,
ci->i_vino, off, &len, 0, 1,
@@ -878,29 +877,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
more = len < iov_iter_count(to);
- if (unlikely(iov_iter_is_pipe(to))) {
- ret = iov_iter_get_pages_alloc(to, &pages, len,
- &page_off);
- if (ret <= 0) {
- ceph_osdc_put_request(req);
- ret = -ENOMEM;
- break;
- }
- num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
- if (ret < len) {
- len = ret;
- osd_req_op_extent_update(req, 0, len);
- more = false;
- }
- } else {
- num_pages = calc_pages_for(off, len);
- page_off = off & ~PAGE_MASK;
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages)) {
- ceph_osdc_put_request(req);
- ret = PTR_ERR(pages);
- break;
- }
+ num_pages = calc_pages_for(off, len);
+ page_off = off & ~PAGE_MASK;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
+ break;
}
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
@@ -931,36 +914,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ret += zlen;
}
- if (unlikely(iov_iter_is_pipe(to))) {
- if (ret > 0) {
- iov_iter_advance(to, ret);
- off += ret;
- } else {
- iov_iter_advance(to, 0);
- }
- ceph_put_page_vector(pages, num_pages, false);
- } else {
- int idx = 0;
- size_t left = ret > 0 ? ret : 0;
- while (left > 0) {
- size_t len, copied;
- page_off = off & ~PAGE_MASK;
- len = min_t(size_t, left, PAGE_SIZE - page_off);
- copied = copy_page_to_iter(pages[idx++],
- page_off, len, to);
- off += copied;
- left -= copied;
- if (copied < len) {
- ret = -EFAULT;
- break;
- }
+ idx = 0;
+ left = ret > 0 ? ret : 0;
+ while (left > 0) {
+ size_t len, copied;
+ page_off = off & ~PAGE_MASK;
+ len = min_t(size_t, left, PAGE_SIZE - page_off);
+ SetPageUptodate(pages[idx]);
+ copied = copy_page_to_iter(pages[idx++],
+ page_off, len, to);
+ off += copied;
+ left -= copied;
+ if (copied < len) {
+ ret = -EFAULT;
+ break;
}
- ceph_release_page_vector(pages, num_pages);
}
+ ceph_release_page_vector(pages, num_pages);
if (ret < 0) {
- if (ret == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (ret == -EBLOCKLISTED)
+ fsc->blocklisted = true;
break;
}
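The unified copy path above walks the page vector in chunks bounded by the in-page offset; a standalone model of that loop (stub values, no kernel types):

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		unsigned long off = 1000;  /* starting file offset of the read */
		unsigned long left = 9000; /* bytes the OSD actually returned  */
		int idx = 0;

		while (left > 0) {
			unsigned long page_off = off & (PAGE_SIZE - 1);
			unsigned long len = left < PAGE_SIZE - page_off ?
					    left : PAGE_SIZE - page_off;

			printf("page %d: copy %lu bytes at in-page offset %lu\n",
			       idx++, len, page_off);
			off += len;
			left -= len;
		}
		return 0;
	}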
@@ -1052,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_client_metric *metric = &fsc->mdsc->metric;
+ struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d163fa96cb40..526faf4778ce 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work);
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
ci->i_vino = *(struct ceph_vino *)data;
inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
+ percpu_counter_inc(&mdsc->metric.total_inodes);
+
return 0;
}
@@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode)
void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_frag *frag;
struct rb_node *n;
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ percpu_counter_dec(&mdsc->metric.total_inodes);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
@@ -558,8 +564,6 @@ void ceph_evict_inode(struct inode *inode)
* caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
- struct ceph_mds_client *mdsc =
- ceph_inode_to_client(inode)->mdsc;
if (ceph_snap(inode) == CEPH_NOSNAP) {
struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n",
@@ -739,7 +743,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_session *session, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d6b9166e71e4..048a435a29be 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_request *req;
int err;
u64 length = 0;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4a26862d7667..08f1c0c31dc2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3303,7 +3303,7 @@ bad:
}
static int __decode_session_metadata(void **p, void *end,
- bool *blacklisted)
+ bool *blocklisted)
{
/* map<string,string> */
u32 n;
@@ -3317,8 +3317,12 @@ static int __decode_session_metadata(void **p, void *end,
*p += len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
+ /*
+ * Match "blocklisted (blacklisted)" from newer MDSes,
+ * or "blacklisted" from older MDSes.
+ */
if (err_str && strnstr(*p, "blacklisted", len))
- *blacklisted = true;
+ *blocklisted = true;
*p += len;
}
return 0;
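The rename keeps matching the legacy token because "blocklisted (blacklisted)" still contains "blacklisted"; a standalone check, with strstr() standing in for the length-bounded strnstr():

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *newer = "blocklisted (blacklisted)";
		const char *older = "blacklisted";

		printf("%d %d\n", strstr(newer, "blacklisted") != NULL,
		       strstr(older, "blacklisted") != NULL); /* prints: 1 1 */
		return 0;
	}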
@@ -3341,7 +3345,7 @@ static void handle_session(struct ceph_mds_session *session,
u32 op;
u64 seq, features = 0;
int wake = 0;
- bool blacklisted = false;
+ bool blocklisted = false;
/* decode */
ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3354,7 +3358,7 @@ static void handle_session(struct ceph_mds_session *session,
if (msg_version >= 3) {
u32 len;
/* version >= 2, metadata */
- if (__decode_session_metadata(&p, end, &blacklisted) < 0)
+ if (__decode_session_metadata(&p, end, &blocklisted) < 0)
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
@@ -3445,8 +3449,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
- if (blacklisted)
- mdsc->fsc->blacklisted = true;
+ if (blocklisted)
+ mdsc->fsc->blocklisted = true;
wake = 2; /* for good measure */
break;
@@ -3612,6 +3616,39 @@ fail_msg:
return err;
}
+static struct dentry *d_find_primary(struct inode *inode)
+{
+ struct dentry *alias, *dn = NULL;
+
+ if (hlist_empty(&inode->i_dentry))
+ return NULL;
+
+ spin_lock(&inode->i_lock);
+ if (hlist_empty(&inode->i_dentry))
+ goto out_unlock;
+
+ if (S_ISDIR(inode->i_mode)) {
+ alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+ if (!IS_ROOT(alias))
+ dn = dget(alias);
+ goto out_unlock;
+ }
+
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ if (!d_unhashed(alias) &&
+ (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
+ dn = dget_dlock(alias);
+ }
+ spin_unlock(&alias->d_lock);
+ if (dn)
+ break;
+ }
+out_unlock:
+ spin_unlock(&inode->i_lock);
+ return dn;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
@@ -3625,13 +3662,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
- int err;
+ struct dentry *dentry;
+ char *path;
+ int pathlen, err;
+ u64 pathbase;
u64 snap_follows;
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
+ dentry = d_find_primary(inode);
+ if (dentry) {
+ /* set pathbase to parent dir when msg_version >= 2 */
+ path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+ recon_state->msg_version >= 2);
+ dput(dentry);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out_err;
+ }
+ } else {
+ path = NULL;
+ pathlen = 0;
+ pathbase = 0;
+ }
+
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -3652,7 +3708,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = 0;
+ rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -3663,7 +3719,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = 0;
+ rec.v1.pathbase = cpu_to_le64(pathbase);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -3725,7 +3781,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -3749,7 +3805,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, NULL, 0);
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -3758,39 +3814,20 @@ encode_again:
out_freeflocks:
kfree(flocks);
} else {
- u64 pathbase = 0;
- int pathlen = 0;
- char *path = NULL;
- struct dentry *dentry;
-
- dentry = d_find_alias(inode);
- if (dentry) {
- path = ceph_mdsc_build_path(dentry,
- &pathlen, &pathbase, 0);
- dput(dentry);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out_err;
- }
- rec.v1.pathbase = cpu_to_le64(pathbase);
- }
-
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
pathlen + sizeof(rec.v1));
- if (err) {
- goto out_freepath;
- }
+ if (err)
+ goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-out_freepath:
- ceph_mdsc_free_path(path, pathlen);
}
out_err:
- if (err >= 0)
+ ceph_mdsc_free_path(path, pathlen);
+ if (!err)
recon_state->nr_caps++;
return err;
}
@@ -4334,14 +4371,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
return;
- if (!READ_ONCE(fsc->blacklisted))
+ if (!READ_ONCE(fsc->blocklisted))
return;
if (fsc->last_auto_reconnect &&
time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
return;
- pr_info("auto reconnect after blacklisted\n");
+ pr_info("auto reconnect after blocklisted\n");
fsc->last_auto_reconnect = jiffies;
ceph_force_reconnect(fsc->sb);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 658800605bfb..cbf8af437140 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -393,7 +393,7 @@ struct ceph_mds_client {
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
- int max_sessions; /* len of s_mds_sessions */
+ int max_sessions; /* len of sessions array */
int stopping; /* true if shutting down */
atomic64_t quotarealms_count; /* # realms with quota */
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 2466b261fba2..fee4c4778313 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m)
m->total_metadatas = 0;
m->metadata_latency_sum = 0;
+ atomic64_set(&m->opened_files, 0);
+ ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_opened_inodes;
+ ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_total_inodes;
+
m->session = NULL;
INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
return 0;
+err_total_inodes:
+ percpu_counter_destroy(&m->opened_inodes);
+err_opened_inodes:
+ percpu_counter_destroy(&m->i_caps_mis);
err_i_caps_mis:
percpu_counter_destroy(&m->i_caps_hit);
err_i_caps_hit:
@@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
if (!m)
return;
+ percpu_counter_destroy(&m->total_inodes);
+ percpu_counter_destroy(&m->opened_inodes);
percpu_counter_destroy(&m->i_caps_mis);
percpu_counter_destroy(&m->i_caps_hit);
percpu_counter_destroy(&m->d_lease_mis);
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index 1d0959d669d7..710f3f1dceab 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -115,6 +115,13 @@ struct ceph_client_metric {
ktime_t metadata_latency_min;
ktime_t metadata_latency_max;
+ /* The total number of directories and files that are opened */
+ atomic64_t opened_files;
+
+ /* The total number of inodes that have opened files or directories */
+ struct percpu_counter opened_inodes;
+ struct percpu_counter total_inodes;
+
struct ceph_mds_session *session;
struct delayed_work delayed_work; /* delayed work */
};
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index cc2c4d40b022..83cb4f26b689 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -12,7 +12,7 @@
void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
if (inc)
atomic64_inc(&mdsc->quotarealms_count);
else
@@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct super_block *sb = mdsc->fsc->sb;
+ struct super_block *sb = inode->i_sb;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct inode *root = d_inode(sb->s_root);
if (atomic64_read(&mdsc->quotarealms_count) > 0)
@@ -266,7 +266,7 @@ restart:
static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
@@ -313,7 +313,7 @@ enum quota_check_op {
static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
loff_t delta)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
struct ceph_snap_realm *realm, *next;
struct inode *in;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 923be9399b21..0da39c16dab4 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7ec0e6d03d10..2516304379d3 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1205,14 +1205,13 @@ nomem:
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
- dev_t dev = s->s_dev;
dout("kill_sb %p\n", s);
ceph_mdsc_pre_umount(fsc->mdsc);
flush_fs_workqueues(fsc);
- generic_shutdown_super(s);
+ kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
ceph_fs_debugfs_cleanup(fsc);
@@ -1220,7 +1219,6 @@ static void ceph_kill_sb(struct super_block *s)
ceph_fscache_unregister_fs(fsc);
destroy_fs_client(fsc);
- free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
@@ -1243,13 +1241,13 @@ int ceph_force_reconnect(struct super_block *sb)
* see remove_session_caps_cb() */
flush_workqueue(fsc->inode_wq);
- /* In case that we were blacklisted. This also reset
+ /* In case we were blocklisted. This also resets
* all mon/osd connections */
ceph_reset_client_addr(fsc->client);
ceph_osdc_clear_abort_err(&fsc->client->osdc);
- fsc->blacklisted = false;
+ fsc->blocklisted = false;
fsc->mount_state = CEPH_MOUNT_MOUNTED;
if (sb->s_root) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a3995ebe0623..482473e4cce1 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -32,7 +32,7 @@
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
-#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */
+#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
@@ -109,7 +109,7 @@ struct ceph_fs_client {
unsigned long mount_state;
unsigned long last_auto_reconnect;
- bool blacklisted;
+ bool blocklisted;
bool have_copy_from2;
@@ -160,7 +160,8 @@ struct ceph_cap {
int issued; /* latest, from the mds */
int implemented; /* implemented superset of
issued (for revocation) */
- int mds, mds_wanted;
+ int mds; /* mds index for this cap */
+ int mds_wanted; /* caps wanted from this mds */
};
/* caps to release */
struct {
@@ -451,6 +452,12 @@ ceph_sb_to_client(const struct super_block *sb)
return (struct ceph_fs_client *)sb->s_fs_info;
}
+static inline struct ceph_mds_client *
+ceph_sb_to_mdsc(const struct super_block *sb)
+{
+ return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
+}
+
static inline struct ceph_vino
ceph_vino(const struct inode *inode)
{
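The new accessor collapses the two-hop sb->fs_client->mdsc lookup that the converted call sites above kept open-coding. A standalone model with stub types (not the kernel definitions):

	#include <assert.h>

	struct mds_client { int id; };
	struct fs_client  { struct mds_client *mdsc; };
	struct super_blk  { void *s_fs_info; };

	static struct fs_client *sb_to_client(struct super_blk *sb)
	{
		return sb->s_fs_info;
	}

	static struct mds_client *sb_to_mdsc(struct super_blk *sb)
	{
		return sb_to_client(sb)->mdsc;
	}

	int main(void)
	{
		struct mds_client m = { 42 };
		struct fs_client f = { &m };
		struct super_blk s = { &f };

		assert(sb_to_mdsc(&s) == sb_to_client(&s)->mdsc);
		return 0;
	}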
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 3a733ac33d9b..197cb1234341 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
* NULL terminates however, so call it on a temporary buffer and then memcpy
* the result into place.
*/
-static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
+static __printf(3, 4)
+int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
{
int ret;
va_list args;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 32f90dc82c84..d44df8f95bcd 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1208,7 +1208,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
rqst[1].rq_iov = si_iov;
rqst[1].rq_nvec = 1;
- len = sizeof(ea) + ea_name_len + ea_value_len + 1;
+ len = sizeof(*ea) + ea_name_len + ea_value_len + 1;
ea = kzalloc(len, GFP_KERNEL);
if (ea == NULL) {
rc = -ENOMEM;
diff --git a/fs/compat.c b/fs/compat.c
deleted file mode 100644
index 436d228cf71c..000000000000
--- a/fs/compat.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/compat.c
- *
- * Kernel compatibililty routines for e.g. 32 bit syscall support
- * on 64 bit kernels.
- *
- * Copyright (C) 2002 Stephen Rothwell, IBM Corporation
- * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
- * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
- * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
- */
-
-#include <linux/compat.h>
-#include <linux/nfs4_mount.h>
-#include <linux/syscalls.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include "internal.h"
-
-struct compat_nfs_string {
- compat_uint_t len;
- compat_uptr_t data;
-};
-
-static inline void compat_nfs_string(struct nfs_string *dst,
- struct compat_nfs_string *src)
-{
- dst->data = compat_ptr(src->data);
- dst->len = src->len;
-}
-
-struct compat_nfs4_mount_data_v1 {
- compat_int_t version;
- compat_int_t flags;
- compat_int_t rsize;
- compat_int_t wsize;
- compat_int_t timeo;
- compat_int_t retrans;
- compat_int_t acregmin;
- compat_int_t acregmax;
- compat_int_t acdirmin;
- compat_int_t acdirmax;
- struct compat_nfs_string client_addr;
- struct compat_nfs_string mnt_path;
- struct compat_nfs_string hostname;
- compat_uint_t host_addrlen;
- compat_uptr_t host_addr;
- compat_int_t proto;
- compat_int_t auth_flavourlen;
- compat_uptr_t auth_flavours;
-};
-
-static int do_nfs4_super_data_conv(void *raw_data)
-{
- int version = *(compat_uint_t *) raw_data;
-
- if (version == 1) {
- struct compat_nfs4_mount_data_v1 *raw = raw_data;
- struct nfs4_mount_data *real = raw_data;
-
- /* copy the fields backwards */
- real->auth_flavours = compat_ptr(raw->auth_flavours);
- real->auth_flavourlen = raw->auth_flavourlen;
- real->proto = raw->proto;
- real->host_addr = compat_ptr(raw->host_addr);
- real->host_addrlen = raw->host_addrlen;
- compat_nfs_string(&real->hostname, &raw->hostname);
- compat_nfs_string(&real->mnt_path, &raw->mnt_path);
- compat_nfs_string(&real->client_addr, &raw->client_addr);
- real->acdirmax = raw->acdirmax;
- real->acdirmin = raw->acdirmin;
- real->acregmax = raw->acregmax;
- real->acregmin = raw->acregmin;
- real->retrans = raw->retrans;
- real->timeo = raw->timeo;
- real->wsize = raw->wsize;
- real->rsize = raw->rsize;
- real->flags = raw->flags;
- real->version = raw->version;
- }
-
- return 0;
-}
-
-#define NFS4_NAME "nfs4"
-
-COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
- const char __user *, dir_name,
- const char __user *, type, compat_ulong_t, flags,
- const void __user *, data)
-{
- char *kernel_type;
- void *options;
- char *kernel_dev;
- int retval;
-
- kernel_type = copy_mount_string(type);
- retval = PTR_ERR(kernel_type);
- if (IS_ERR(kernel_type))
- goto out;
-
- kernel_dev = copy_mount_string(dev_name);
- retval = PTR_ERR(kernel_dev);
- if (IS_ERR(kernel_dev))
- goto out1;
-
- options = copy_mount_options(data);
- retval = PTR_ERR(options);
- if (IS_ERR(options))
- goto out2;
-
- if (kernel_type && options) {
- if (!strcmp(kernel_type, NFS4_NAME)) {
- retval = -EINVAL;
- if (do_nfs4_super_data_conv(options))
- goto out3;
- }
- }
-
- retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
-
- out3:
- kfree(options);
- out2:
- kfree(kernel_dev);
- out1:
- kfree(kernel_type);
- out:
- return retval;
-}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index ca2273727225..b0983e2a4e2c 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1168,7 +1168,7 @@ EXPORT_SYMBOL(configfs_depend_item);
/*
* Release the dependent linkage. This is much simpler than
- * configfs_depend_item() because we know that that the client driver is
+ * configfs_depend_item() because we know that the client driver is
* pinned, thus the subsystem is pinned, and therefore configfs is pinned.
*/
void configfs_undepend_item(struct config_item *target)
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index fb65b706cc0d..1f0270229d7b 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -267,7 +267,7 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou
* There is no easy way for us to know if userspace is only doing a partial
* write, so we don't support them. We expect the entire buffer to come
* on the first write.
- * Hint: if you're writing a value, first read the file, modify only the
+ * Hint: if you're writing a value, first read the file, modify only
* the value you're changing, then write entire buffer back.
*/
diff --git a/fs/coredump.c b/fs/coredump.c
index 76e7c10edfc0..0cd9056d79cc 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -840,17 +840,17 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
ssize_t n;
if (cprm->written + nr > cprm->limit)
return 0;
- while (nr) {
- if (dump_interrupted())
- return 0;
- n = __kernel_write(file, addr, nr, &pos);
- if (n <= 0)
- return 0;
- file->f_pos = pos;
- cprm->written += n;
- cprm->pos += n;
- nr -= n;
- }
+
+ if (dump_interrupted())
+ return 0;
+ n = __kernel_write(file, addr, nr, &pos);
+ if (n != nr)
+ return 0;
+ file->f_pos = pos;
+ cprm->written += n;
+ cprm->pos += n;
+
return 1;
}
EXPORT_SYMBOL(dump_emit);
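The rewrite drops the retry loop: any short or failed write now aborts the dump instead of being resubmitted. A userspace model of the new contract:

	#include <stdio.h>
	#include <unistd.h>

	/* returns 1 only on a complete write, mirroring the new dump_emit() */
	static int emit(int fd, const void *buf, size_t nr)
	{
		ssize_t n = write(fd, buf, nr);

		return n == (ssize_t)nr;
	}

	int main(void)
	{
		static const char msg[] = "core data\n";

		return emit(STDOUT_FILENO, msg, sizeof(msg) - 1) ? 0 : 1;
	}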
@@ -876,6 +876,40 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
}
EXPORT_SYMBOL(dump_skip);
+#ifdef CONFIG_ELF_CORE
+int dump_user_range(struct coredump_params *cprm, unsigned long start,
+ unsigned long len)
+{
+ unsigned long addr;
+
+ for (addr = start; addr < start + len; addr += PAGE_SIZE) {
+ struct page *page;
+ int stop;
+
+ /*
+ * To avoid having to allocate page tables for virtual address
+ * ranges that have never been used yet, and also to make it
+ * easy to generate sparse core files, use a helper that returns
+ * NULL when encountering an empty page table entry that would
+ * otherwise have been filled with the zero page.
+ */
+ page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+
+ stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
+ kunmap(page);
+ put_page(page);
+ } else {
+ stop = !dump_skip(cprm, PAGE_SIZE);
+ }
+ if (stop)
+ return 0;
+ }
+ return 1;
+}
+#endif
+
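A standalone model of the page-granular walk in dump_user_range(); get_dump_page() and dump_emit() are kernel-only, so a stub stands in, with every other page pretending to be an unmapped hole:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	static const void *get_page_stub(unsigned long addr)
	{
		return (addr / PAGE_SIZE) % 2 ? NULL : "data";
	}

	int main(void)
	{
		unsigned long start = 0, len = 4 * PAGE_SIZE, addr;

		for (addr = start; addr < start + len; addr += PAGE_SIZE) {
			const void *p = get_page_stub(addr);

			printf("0x%06lx: %s\n", addr,
			       p ? "emit page" : "skip (sparse hole)");
		}
		return 0;
	}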
int dump_align(struct coredump_params *cprm, int align)
{
unsigned mod = cprm->pos & (align - 1);
@@ -902,3 +936,183 @@ void dump_truncate(struct coredump_params *cprm)
}
}
EXPORT_SYMBOL(dump_truncate);
+
+/*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include the vDSO, vsyscall, and other
+ * architecture-specific mappings.
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+ /* Any vsyscall mappings? */
+ if (vma == get_gate_vma(vma->vm_mm))
+ return true;
+
+ /*
+ * Assume that all vmas with a .name op should always be dumped.
+ * If this changes, a new vm_ops field can easily be added.
+ */
+ if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
+ return true;
+
+ /*
+ * arch_vma_name() returns non-NULL for special architecture mappings,
+ * such as vDSO sections.
+ */
+ if (arch_vma_name(vma))
+ return true;
+
+ return false;
+}
+
+/*
+ * Decide how much of @vma's contents should be included in a core dump.
+ */
+static unsigned long vma_dump_size(struct vm_area_struct *vma,
+ unsigned long mm_flags)
+{
+#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
+
+ /* always dump the vdso and vsyscall sections */
+ if (always_dump_vma(vma))
+ goto whole;
+
+ if (vma->vm_flags & VM_DONTDUMP)
+ return 0;
+
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
+ /* Hugetlb memory check */
+ if (is_vm_hugetlb_page(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
+ /* Do not dump I/O mapped devices or special mappings */
+ if (vma->vm_flags & VM_IO)
+ return 0;
+
+ /* By default, dump shared memory if mapped from an anonymous file. */
+ if (vma->vm_flags & VM_SHARED) {
+ if (file_inode(vma->vm_file)->i_nlink == 0 ?
+ FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
+ goto whole;
+ return 0;
+ }
+
+ /* Dump segments that have been written to. */
+ if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE))
+ goto whole;
+ if (vma->vm_file == NULL)
+ return 0;
+
+ if (FILTER(MAPPED_PRIVATE))
+ goto whole;
+
+ /*
+ * If this is the beginning of an executable file mapping,
+ * dump the first page to aid in determining what was mapped here.
+ */
+ if (FILTER(ELF_HEADERS) &&
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
+ (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+#undef FILTER
+
+ return 0;
+
+whole:
+ return vma->vm_end - vma->vm_start;
+}
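A toy model of the FILTER() test in vma_dump_size(): each MMF_DUMP_* type is a single bit in mm_flags. The bit positions below are assumptions for illustration, not the real MMF_DUMP_* values.

	#include <stdio.h>

	enum { DUMP_ANON_PRIVATE = 0, DUMP_MAPPED_SHARED = 3 };

	#define FILTER(flags, type) ((flags) & (1UL << (type)))

	int main(void)
	{
		unsigned long mm_flags = 1UL << DUMP_ANON_PRIVATE;

		printf("anon private : %s\n",
		       FILTER(mm_flags, DUMP_ANON_PRIVATE) ? "dump whole" : "skip");
		printf("mapped shared: %s\n",
		       FILTER(mm_flags, DUMP_MAPPED_SHARED) ? "dump whole" : "skip");
		return 0;
	}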
+
+static struct vm_area_struct *first_vma(struct task_struct *tsk,
+ struct vm_area_struct *gate_vma)
+{
+ struct vm_area_struct *ret = tsk->mm->mmap;
+
+ if (ret)
+ return ret;
+ return gate_vma;
+}
+
+/*
+ * Helper function for iterating across a vma list. It ensures that the caller
+ * will visit `gate_vma' prior to terminating the search.
+ */
+static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+ struct vm_area_struct *gate_vma)
+{
+ struct vm_area_struct *ret;
+
+ ret = this_vma->vm_next;
+ if (ret)
+ return ret;
+ if (this_vma == gate_vma)
+ return NULL;
+ return gate_vma;
+}
+
+/*
+ * Under the mmap_lock, take a snapshot of relevant information about the task's
+ * VMAs.
+ */
+int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
+ struct core_vma_metadata **vma_meta,
+ size_t *vma_data_size_ptr)
+{
+ struct vm_area_struct *vma, *gate_vma;
+ struct mm_struct *mm = current->mm;
+ int i;
+ size_t vma_data_size = 0;
+
+ /*
+ * Once the stack expansion code is fixed to not change VMA bounds
+ * under mmap_lock in read mode, this can be changed to take the
+ * mmap_lock in read mode.
+ */
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ gate_vma = get_gate_vma(mm);
+ *vma_count = mm->map_count + (gate_vma ? 1 : 0);
+
+ *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
+ if (!*vma_meta) {
+ mmap_write_unlock(mm);
+ return -ENOMEM;
+ }
+
+ for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+ vma = next_vma(vma, gate_vma), i++) {
+ struct core_vma_metadata *m = (*vma_meta) + i;
+
+ m->start = vma->vm_start;
+ m->end = vma->vm_end;
+ m->flags = vma->vm_flags;
+ m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+
+ vma_data_size += m->dump_size;
+ }
+
+ mmap_write_unlock(mm);
+
+ if (WARN_ON(i != *vma_count))
+ return -EFAULT;
+
+ *vma_data_size_ptr = vma_data_size;
+ return 0;
+}
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 9212325763b0..4ef3f714046a 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -343,9 +343,11 @@ void fscrypt_msg(const struct inode *inode, const char *level,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- if (inode)
+ if (inode && inode->i_ino)
printk("%sfscrypt (%s, inode %lu): %pV\n",
level, inode->i_sb->s_id, inode->i_ino, &vaf);
+ else if (inode)
+ printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf);
else
printk("%sfscrypt: %pV\n", level, &vaf);
va_end(args);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 011830f84d8d..1fbe6c24d705 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -61,15 +61,6 @@ struct fscrypt_nokey_name {
*/
#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
-static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
-{
- struct sha256_state sctx;
-
- sha256_init(&sctx);
- sha256_update(&sctx, data, data_len);
- sha256_final(&sctx, result);
-}
-
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -242,11 +233,11 @@ static int base64_decode(const char *src, int len, u8 *dst)
return cp - dst;
}
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret)
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret)
{
- const struct fscrypt_info *ci = inode->i_crypt_info;
- int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) &
+ int padding = 4 << (fscrypt_policy_flags(policy) &
FSCRYPT_POLICY_FLAGS_PAD_MASK);
u32 encrypted_len;
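A standalone check of the padding arithmetic above: the two policy flag bits select a NUL-padding amount of 4, 8, 16 or 32 bytes, and the encrypted name length is rounded up to that multiple (the real helper also enforces a minimum message length and the filesystem's max_len, elided here):

	#include <stdio.h>

	#define PAD_MASK 0x03 /* stands in for FSCRYPT_POLICY_FLAGS_PAD_MASK */

	int main(void)
	{
		int flags;

		for (flags = 0; flags <= PAD_MASK; flags++) {
			unsigned int padding = 4 << (flags & PAD_MASK);
			unsigned int len = 13; /* example plaintext name length */
			unsigned int rounded = (len + padding - 1) & ~(padding - 1);

			printf("pad flags %d -> padding %2u, 13-byte name -> %2u\n",
			       flags, padding, rounded);
		}
		return 0;
	}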
@@ -260,8 +251,6 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
/**
* fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
- * @inode: inode of the parent directory (for regular filenames)
- * or of the symlink (for symlink targets)
* @max_encrypted_len: maximum length of encrypted filenames the buffer will be
* used to present
* @crypto_str: (output) buffer to allocate
@@ -271,8 +260,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_fname_alloc_buffer(const struct inode *inode,
- u32 max_encrypted_len,
+int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
struct fscrypt_str *crypto_str)
{
const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX);
@@ -369,9 +357,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
} else {
memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes));
/* Compute strong hash of remaining part of name. */
- fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
- iname->len - sizeof(nokey_name.bytes),
- nokey_name.sha256);
+ sha256(&iname->name[sizeof(nokey_name.bytes)],
+ iname->len - sizeof(nokey_name.bytes),
+ nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
@@ -394,9 +382,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
* directory's encryption key, then @iname is the plaintext, so we encrypt it to
* get the disk_name.
*
- * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
- * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be
- * impossible in this case, so we fail them with ENOKEY.
+ * Else, for keyless @lookup operations, @iname should be a no-key name, so we
+ * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will
+ * be impossible in this case, so we fail them with ENOKEY.
*
* If successful, fscrypt_free_filename() must be called later to clean up.
*
@@ -421,7 +409,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return ret;
if (fscrypt_has_encryption_key(dir)) {
- if (!fscrypt_fname_encrypted_size(dir, iname->len,
+ if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
+ iname->len,
dir->i_sb->s_cop->max_namelen,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
@@ -440,7 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
}
if (!lookup)
return -ENOKEY;
- fname->is_ciphertext_name = true;
+ fname->is_nokey_name = true;
/*
* We don't have the key and we are doing a lookup; decode the
@@ -499,7 +488,7 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
{
const struct fscrypt_nokey_name *nokey_name =
(const void *)fname->crypto_buf.name;
- u8 sha256[SHA256_DIGEST_SIZE];
+ u8 digest[SHA256_DIGEST_SIZE];
if (likely(fname->disk_name.name)) {
if (de_name_len != fname->disk_name.len)
@@ -510,9 +499,9 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
return false;
if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes)))
return false;
- fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
- de_name_len - sizeof(nokey_name->bytes), sha256);
- return !memcmp(sha256, nokey_name->sha256, sizeof(sha256));
+ sha256(&de_name[sizeof(nokey_name->bytes)],
+ de_name_len - sizeof(nokey_name->bytes), digest);
+ return !memcmp(digest, nokey_name->sha256, sizeof(digest));
}
EXPORT_SYMBOL_GPL(fscrypt_match_name);
@@ -541,7 +530,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash);
* Validate dentries in encrypted directories to make sure we aren't potentially
* caching stale dentries after a key has been added.
*/
-static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *dir;
int err;
@@ -549,17 +538,17 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
/*
* Plaintext names are always valid, since fscrypt doesn't support
- * reverting to ciphertext names without evicting the directory's inode
+ * reverting to no-key names without evicting the directory's inode
* -- which implies eviction of the dentries in the directory.
*/
- if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
+ if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
return 1;
/*
- * Ciphertext name; valid if the directory's key is still unavailable.
+ * No-key name; valid if the directory's key is still unavailable.
*
- * Although fscrypt forbids rename() on ciphertext names, we still must
- * use dget_parent() here rather than use ->d_parent directly. That's
+ * Although fscrypt forbids rename() on no-key names, we still must use
+ * dget_parent() here rather than use ->d_parent directly. That's
* because a corrupted fs image may contain directory hard links, which
* the VFS handles by moving the directory's dentry tree in the dcache
* each time ->lookup() finds the directory and it already has a dentry
@@ -580,6 +569,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return valid;
}
+EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 8117a61b6f55..4f5806a3b73d 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
return NULL;
}
-#undef fscrypt_policy
union fscrypt_policy {
u8 version;
struct fscrypt_policy_v1 v1;
@@ -292,8 +291,9 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
/* fname.c */
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret);
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
extern const struct dentry_operations fscrypt_d_ops;
/* hkdf.c */
@@ -572,6 +572,9 @@ int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
const struct fscrypt_master_key *mk);
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk);
+
/* keysetup_v1.c */
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
@@ -590,5 +593,6 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
const union fscrypt_context *ctx_u,
int ctx_size);
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir);
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 09fb8aa0f2e9..20b0df47fe6a 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -60,8 +60,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
if (err)
return err;
- /* ... in case we looked up ciphertext name before key was added */
- if (dentry->d_flags & DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name before key was added */
+ if (dentry->d_flags & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (!fscrypt_has_permitted_context(dir, inode))
@@ -85,9 +85,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
return err;
- /* ... in case we looked up ciphertext name(s) before key was added */
- if ((old_dentry->d_flags | new_dentry->d_flags) &
- DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name(s) before key was added */
+ if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (old_dir != new_dir) {
@@ -114,9 +113,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
if (err && err != -ENOENT)
return err;
- if (fname->is_ciphertext_name) {
+ if (fname->is_nokey_name) {
spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_ENCRYPTED_NAME;
+ dentry->d_flags |= DCACHE_NOKEY_NAME;
spin_unlock(&dentry->d_lock);
d_set_d_op(dentry, &fscrypt_d_ops);
}
@@ -166,26 +165,51 @@ int fscrypt_prepare_setflags(struct inode *inode,
return 0;
}
-int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
- unsigned int max_len,
- struct fscrypt_str *disk_link)
+/**
+ * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink
+ * @dir: directory in which the symlink is being created
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @max_len: space the filesystem has available to store the symlink target
+ * @disk_link: (out) the on-disk symlink target being prepared
+ *
+ * This function computes the size the symlink target will require on-disk,
+ * stores it in @disk_link->len, and validates it against @max_len. An
+ * encrypted symlink may be longer than the original.
+ *
+ * Additionally, @disk_link->name is set to @target if the symlink will be
+ * unencrypted, but left NULL if the symlink will be encrypted. For encrypted
+ * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the
+ * on-disk target later. (The reason for the two-step process is that some
+ * filesystems need to know the size of the symlink target before creating the
+ * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
+ *
+ * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
+ * -ENOKEY if the encryption key is missing, or another -errno code if a problem
+ * occurred while setting up the encryption key.
+ */
+int fscrypt_prepare_symlink(struct inode *dir, const char *target,
+ unsigned int len, unsigned int max_len,
+ struct fscrypt_str *disk_link)
{
- int err;
+ const union fscrypt_policy *policy;
/*
* To calculate the size of the encrypted symlink target we need to know
* the amount of NUL padding, which is determined by the flags set in
* the encryption policy which will be inherited from the directory.
- * The easiest way to get access to this is to just load the directory's
- * fscrypt_info, since we'll need it to create the dir_entry anyway.
- *
- * Note: in test_dummy_encryption mode, @dir may be unencrypted.
*/
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- if (!fscrypt_has_encryption_key(dir))
- return -ENOKEY;
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL) {
+ /* Not encrypted */
+ disk_link->name = (unsigned char *)target;
+ disk_link->len = len + 1;
+ if (disk_link->len > max_len)
+ return -ENAMETOOLONG;
+ return 0;
+ }
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
/*
* Calculate the size of the encrypted symlink and verify it won't
@@ -198,7 +222,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
* counting it (even though it is meaningless for ciphertext) is simpler
* for now since filesystems will assume it is there and subtract it.
*/
- if (!fscrypt_fname_encrypted_size(dir, len,
+ if (!fscrypt_fname_encrypted_size(policy, len,
max_len - sizeof(struct fscrypt_symlink_data),
&disk_link->len))
return -ENAMETOOLONG;
@@ -207,7 +231,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
disk_link->name = NULL;
return 0;
}
-EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink);
+EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink);
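A hedged sketch of how a filesystem's symlink path might use the two-step API the kerneldoc describes; the surrounding inode and transaction handling is hypothetical and error paths are trimmed:

	static int example_symlink(struct inode *dir, struct inode *inode,
				   const char *target, unsigned int max_len)
	{
		struct fscrypt_str disk_link;
		int err;

		/* step 1: size the on-disk target before creating the inode */
		err = fscrypt_prepare_symlink(dir, target, strlen(target),
					      max_len, &disk_link);
		if (err)
			return err;

		/* step 2: disk_link.name == NULL means the symlink is encrypted,
		 * so produce the ciphertext now that the inode and key exist */
		if (!disk_link.name)
			err = fscrypt_encrypt_symlink(inode, target,
						      strlen(target), &disk_link);
		return err;
	}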
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
unsigned int len, struct fscrypt_str *disk_link)
@@ -217,9 +241,13 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
struct fscrypt_symlink_data *sd;
unsigned int ciphertext_len;
- err = fscrypt_require_key(inode);
- if (err)
- return err;
+ /*
+ * fscrypt_prepare_new_inode() should have already set up the new
+ * symlink inode's encryption key. We don't wait until now to do it,
+ * since we may already be inside a filesystem transaction.
+ */
+ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode)))
+ return -ENOKEY;
if (disk_link->name) {
/* filesystem-provided buffer */
@@ -319,7 +347,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
if (cstr.len + sizeof(*sd) - 1 > max_size)
return ERR_PTR(-EUCLEAN);
- err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ err = fscrypt_fname_alloc_buffer(cstr.len, &pstr);
if (err)
return ERR_PTR(err);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index faa25541ccb6..89bffa82ed74 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -106,7 +106,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
crypto_cfg.data_unit_size = sb->s_blocksize;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
num_devs = fscrypt_get_num_devices(sb);
- devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS);
+ devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
if (!devs)
return -ENOMEM;
fscrypt_get_devices(sb, num_devs, devs);
@@ -135,9 +135,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
struct fscrypt_blk_crypto_key *blk_key;
int err;
int i;
- unsigned int flags;
- blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS);
+ blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL);
if (!blk_key)
return -ENOMEM;
@@ -166,10 +165,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
}
queue_refs++;
- flags = memalloc_nofs_save();
err = blk_crypto_start_using_key(&blk_key->base,
blk_key->devs[i]);
- memalloc_nofs_restore(flags);
if (err) {
fscrypt_err(inode,
"error %d starting to use blk-crypto", err);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index e74f239c4428..53cc552a7b8f 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -817,6 +817,7 @@ static int check_for_busy_inodes(struct super_block *sb,
struct list_head *pos;
size_t busy_count = 0;
unsigned long ino;
+ char ino_str[50] = "";
spin_lock(&mk->mk_decrypted_inodes_lock);
@@ -838,11 +839,15 @@ static int check_for_busy_inodes(struct super_block *sb,
}
spin_unlock(&mk->mk_decrypted_inodes_lock);
+ /* If the inode is currently being created, ino may still be 0. */
+ if (ino)
+ snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino);
+
fscrypt_warn(NULL,
- "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu",
+ "%s: %zu inode(s) still busy after removing key with %s %*phN%s",
sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec),
master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u,
- ino);
+ ino_str);
return -EBUSY;
}
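
[Editor's note: the keyring.c hunk replaces an unconditionally printed ", including ino %lu" with an optional suffix. The pattern generalizes: format the optional detail into a small buffer only when it is known, then splice it into one format string via %s. A self-contained user-space demo:]

#include <stdio.h>

static void report_busy(size_t busy_count, unsigned long ino)
{
	char ino_str[50] = "";

	/* ino == 0 means "unknown": leave the suffix empty */
	if (ino)
		snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino);
	printf("%zu inode(s) still busy after removing key%s\n",
	       busy_count, ino_str);
}
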
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index fea6226afc2b..d3c3e5d9b41f 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -10,6 +10,7 @@
#include <crypto/skcipher.h>
#include <linux/key.h>
+#include <linux/random.h>
#include "fscrypt_private.h"
@@ -222,6 +223,16 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk)
+{
+ WARN_ON(ci->ci_inode->i_ino == 0);
+ WARN_ON(!mk->mk_ino_hash_key_initialized);
+
+ ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
+ &mk->mk_ino_hash_key);
+}
+
static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk)
{
@@ -254,13 +265,20 @@ unlock:
return err;
}
- ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
- &mk->mk_ino_hash_key);
+ /*
+ * New inodes may not have an inode number assigned yet.
+ * Hashing their inode number is delayed until later.
+ */
+ if (ci->ci_inode->i_ino == 0)
+ WARN_ON(!(ci->ci_inode->i_state & I_CREATING));
+ else
+ fscrypt_hash_inode_number(ci, mk);
return 0;
}
static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
- struct fscrypt_master_key *mk)
+ struct fscrypt_master_key *mk,
+ bool need_dirhash_key)
{
int err;
@@ -306,7 +324,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
return err;
/* Derive a secret dirhash key for directories that need it. */
- if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) {
+ if (need_dirhash_key) {
err = fscrypt_derive_dirhash_key(ci, mk);
if (err)
return err;
@@ -326,6 +344,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* key being removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
+ bool need_dirhash_key,
struct key **master_key_ret)
{
struct key *key;
@@ -400,7 +419,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
break;
case FSCRYPT_POLICY_V2:
- err = fscrypt_setup_v2_file_key(ci, mk);
+ err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
break;
default:
WARN_ON(1);
@@ -454,57 +473,28 @@ static void put_crypt_info(struct fscrypt_info *ci)
kmem_cache_free(fscrypt_info_cachep, ci);
}
-int fscrypt_get_encryption_info(struct inode *inode)
+static int
+fscrypt_setup_encryption_info(struct inode *inode,
+ const union fscrypt_policy *policy,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
+ bool need_dirhash_key)
{
struct fscrypt_info *crypt_info;
- union fscrypt_context ctx;
struct fscrypt_mode *mode;
struct key *master_key = NULL;
int res;
- if (fscrypt_has_encryption_key(inode))
- return 0;
-
res = fscrypt_initialize(inode->i_sb->s_cop->flags);
if (res)
return res;
- res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
- if (res < 0) {
- const union fscrypt_context *dummy_ctx =
- fscrypt_get_dummy_context(inode->i_sb);
-
- if (IS_ENCRYPTED(inode) || !dummy_ctx) {
- fscrypt_warn(inode,
- "Error %d getting encryption context",
- res);
- return res;
- }
- /* Fake up a context for an unencrypted directory */
- res = fscrypt_context_size(dummy_ctx);
- memcpy(&ctx, dummy_ctx, res);
- }
-
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
+ crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
if (!crypt_info)
return -ENOMEM;
crypt_info->ci_inode = inode;
-
- res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res);
- if (res) {
- fscrypt_warn(inode,
- "Unrecognized or corrupt encryption context");
- goto out;
- }
-
- memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
- FSCRYPT_FILE_NONCE_SIZE);
-
- if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
- res = -EINVAL;
- goto out;
- }
+ crypt_info->ci_policy = *policy;
+ memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
mode = select_encryption_mode(&crypt_info->ci_policy, inode);
if (IS_ERR(mode)) {
@@ -514,13 +504,14 @@ int fscrypt_get_encryption_info(struct inode *inode)
WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
- res = setup_file_encryption_key(crypt_info, &master_key);
+ res = setup_file_encryption_key(crypt_info, need_dirhash_key,
+ &master_key);
if (res)
goto out;
/*
- * Multiple tasks may race to set ->i_crypt_info, so use
- * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * For existing inodes, multiple tasks may race to set ->i_crypt_info.
+ * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
* fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
* RELEASE barrier so that other tasks can ACQUIRE it.
*/
@@ -550,14 +541,113 @@ out:
up_read(&mk->mk_secret_sem);
key_put(master_key);
}
+ put_crypt_info(crypt_info);
+ return res;
+}
+
+/**
+ * fscrypt_get_encryption_info() - set up an inode's encryption key
+ * @inode: the inode to set up the key for. Must be encrypted.
+ *
+ * Set up ->i_crypt_info, if it hasn't already been done.
+ *
+ * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * generally this shouldn't be called from within a filesystem transaction.
+ *
+ * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
+ * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * distinguish these cases.) Also can return another -errno code.
+ */
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ int res;
+ union fscrypt_context ctx;
+ union fscrypt_policy policy;
+
+ if (fscrypt_has_encryption_key(inode))
+ return 0;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ fscrypt_warn(inode, "Error %d getting encryption context", res);
+ return res;
+ }
+
+ res = fscrypt_policy_from_context(&policy, &ctx, res);
+ if (res) {
+ fscrypt_warn(inode,
+ "Unrecognized or corrupt encryption context");
+ return res;
+ }
+
+ if (!fscrypt_supported_policy(&policy, inode))
+ return -EINVAL;
+
+ res = fscrypt_setup_encryption_info(inode, &policy,
+ fscrypt_context_nonce(&ctx),
+ IS_CASEFOLDED(inode) &&
+ S_ISDIR(inode->i_mode));
if (res == -ENOKEY)
res = 0;
- put_crypt_info(crypt_info);
return res;
}
EXPORT_SYMBOL(fscrypt_get_encryption_info);
/**
+ * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
+ * @dir: a possibly-encrypted directory
+ * @inode: the new inode. ->i_mode must be set already.
+ * ->i_ino doesn't need to be set yet.
+ * @encrypt_ret: (output) set to %true if the new inode will be encrypted
+ *
+ * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * encrypting the name of the new file. Also, if the new inode will be
+ * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ *
+ * This isn't %GFP_NOFS-safe, and therefore it should be called before starting
+ * any filesystem transaction to create the inode. For this reason, ->i_ino
+ * isn't required to be set yet, as the filesystem may not have set it yet.
+ *
+ * This doesn't persist the new inode's encryption context. That still needs to
+ * be done later by calling fscrypt_set_context().
+ *
+ * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
+ * -errno code
+ */
+int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
+ bool *encrypt_ret)
+{
+ const union fscrypt_policy *policy;
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL)
+ return 0;
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
+
+ if (WARN_ON_ONCE(inode->i_mode == 0))
+ return -EINVAL;
+
+ /*
+ * Only regular files, directories, and symlinks are encrypted.
+ * Special files like device nodes and named pipes aren't.
+ */
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ return 0;
+
+ *encrypt_ret = true;
+
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ return fscrypt_setup_encryption_info(inode, policy, nonce,
+ IS_CASEFOLDED(dir) &&
+ S_ISDIR(inode->i_mode));
+}
+EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
+
+/**
* fscrypt_put_encryption_info() - free most of an inode's fscrypt data
* @inode: an inode being evicted
*
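
[Editor's note: fscrypt_hash_inode_number() above exists so that IV_INO_LBLK_32 key setup can be split in two: the SipHash of the inode number is deferred until i_ino has actually been assigned. A minimal sketch of the hash step itself, assuming a siphash_key_t already derived from the master key:]

#include <linux/siphash.h>

static u32 hash_ino(unsigned long ino, const siphash_key_t *key)
{
	/* keyed 64-bit SipHash, truncated to the 32 bits used for IVs */
	return (u32)siphash_1u64(ino, key);
}
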
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index a3cb52572b05..2762c5350432 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -60,7 +60,7 @@ static int derive_key_aes(const u8 *master_key,
goto out;
}
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
if (!req) {
res = -ENOMEM;
goto out;
@@ -99,7 +99,7 @@ find_and_lock_process_key(const char *prefix,
const struct user_key_payload *ukp;
const struct fscrypt_key *payload;
- description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+ description = kasprintf(GFP_KERNEL, "%s%*phN", prefix,
FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
if (!description)
return ERR_PTR(-ENOMEM);
@@ -228,7 +228,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
return dk;
/* Nope, allocate one. */
- dk = kzalloc(sizeof(*dk), GFP_NOFS);
+ dk = kzalloc(sizeof(*dk), GFP_KERNEL);
if (!dk)
return ERR_PTR(-ENOMEM);
refcount_set(&dk->dk_refcount, 1);
@@ -272,7 +272,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci,
* This cannot be a stack buffer because it will be passed to the
* scatterlist crypto API during derive_key_aes().
*/
- derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+ derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL);
if (!derived_key)
return -ENOMEM;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 2d73fd39ad96..4441d9944b9e 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -32,6 +32,14 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+static const union fscrypt_policy *
+fscrypt_get_dummy_policy(struct super_block *sb)
+{
+ if (!sb->s_cop->get_dummy_policy)
+ return NULL;
+ return sb->s_cop->get_dummy_policy(sb);
+}
+
static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
@@ -192,10 +200,15 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
32, 32))
return false;
+ /*
+ * IV_INO_LBLK_32 hashes the inode number, so in principle it can
+ * support any ino_bits. However, currently the inode number is taken
+ * from inode::i_ino, which is 'unsigned long'. So for now the
+ * implementation limit is 32 bits.
+ */
if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
- /* This uses hashed inode numbers, so ino_bits doesn't matter. */
!supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
- INT_MAX, 32))
+ 32, 32))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -231,18 +244,19 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
}
/**
- * fscrypt_new_context_from_policy() - create a new fscrypt_context from
- * an fscrypt_policy
+ * fscrypt_new_context() - create a new fscrypt_context
* @ctx_u: output context
* @policy_u: input policy
+ * @nonce: nonce to use
*
* Create an fscrypt_context for an inode that is being assigned the given
- * encryption policy. A new nonce is randomly generated.
+ * encryption policy. @nonce must be a new random nonce.
*
* Return: the size of the new context in bytes.
*/
-static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
- const union fscrypt_policy *policy_u)
+static int fscrypt_new_context(union fscrypt_context *ctx_u,
+ const union fscrypt_policy *policy_u,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE])
{
memset(ctx_u, 0, sizeof(*ctx_u));
@@ -260,7 +274,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_descriptor,
policy->master_key_descriptor,
sizeof(ctx->master_key_descriptor));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
case FSCRYPT_POLICY_V2: {
@@ -276,7 +290,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_identifier,
policy->master_key_identifier,
sizeof(ctx->master_key_identifier));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
}
@@ -372,6 +386,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
static int set_encryption_policy(struct inode *inode,
const union fscrypt_policy *policy)
{
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
union fscrypt_context ctx;
int ctxsize;
int err;
@@ -409,7 +424,8 @@ static int set_encryption_policy(struct inode *inode,
return -EINVAL;
}
- ctxsize = fscrypt_new_context_from_policy(&ctx, policy);
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ ctxsize = fscrypt_new_context(&ctx, policy, nonce);
return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
}
@@ -620,86 +636,99 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);
+/*
+ * Return the encryption policy that new files in the directory will inherit, or
+ * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also
+ * ensure that its key is set up, so that the new filename can be encrypted.
+ */
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
+{
+ int err;
+
+ if (IS_ENCRYPTED(dir)) {
+ err = fscrypt_require_key(dir);
+ if (err)
+ return ERR_PTR(err);
+ return &dir->i_crypt_info->ci_policy;
+ }
+
+ return fscrypt_get_dummy_policy(dir->i_sb);
+}
+
/**
- * fscrypt_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child: Child inode that inherits the context from @parent.
- * @fs_data: private data given by FS.
- * @preload: preload child i_crypt_info if true
+ * fscrypt_set_context() - Set the fscrypt context of a new inode
+ * @inode: a new inode
+ * @fs_data: private data given by FS and passed to ->set_context()
+ *
+ * This should be called after fscrypt_prepare_new_inode(), generally during a
+ * filesystem transaction. Everything here must be %GFP_NOFS-safe.
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_inherit_context(struct inode *parent, struct inode *child,
- void *fs_data, bool preload)
+int fscrypt_set_context(struct inode *inode, void *fs_data)
{
+ struct fscrypt_info *ci = inode->i_crypt_info;
union fscrypt_context ctx;
int ctxsize;
- struct fscrypt_info *ci;
- int res;
-
- res = fscrypt_get_encryption_info(parent);
- if (res < 0)
- return res;
- ci = fscrypt_get_info(parent);
- if (ci == NULL)
+ /* fscrypt_prepare_new_inode() should have set up the key already. */
+ if (WARN_ON_ONCE(!ci))
return -ENOKEY;
- ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy);
-
BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data);
- if (res)
- return res;
- return preload ? fscrypt_get_encryption_info(child): 0;
+ ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+
+ /*
+ * This may be the first time the inode number is available, so do any
+ * delayed key setup that requires the inode number.
+ */
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
+ (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+ const struct fscrypt_master_key *mk =
+ ci->ci_master_key->payload.data[0];
+
+ fscrypt_hash_inode_number(ci, mk);
+ }
+
+ return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
-EXPORT_SYMBOL(fscrypt_inherit_context);
+EXPORT_SYMBOL_GPL(fscrypt_set_context);
/**
* fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption'
* @sb: the filesystem on which test_dummy_encryption is being specified
- * @arg: the argument to the test_dummy_encryption option.
- * If no argument was specified, then @arg->from == NULL.
- * @dummy_ctx: the filesystem's current dummy context (input/output, see below)
+ * @arg: the argument to the test_dummy_encryption option. May be NULL.
+ * @dummy_policy: the filesystem's current dummy policy (input/output, see
+ * below)
*
* Handle the test_dummy_encryption mount option by creating a dummy encryption
- * context, saving it in @dummy_ctx, and adding the corresponding dummy
- * encryption key to the filesystem. If the @dummy_ctx is already set, then
+ * policy, saving it in @dummy_policy, and adding the corresponding dummy
+ * encryption key to the filesystem. If the @dummy_policy is already set, then
* instead validate that it matches @arg. Don't support changing it via
* remount, as that is difficult to do safely.
*
- * The reason we use an fscrypt_context rather than an fscrypt_policy is because
- * we mustn't generate a new nonce each time we access a dummy-encrypted
- * directory, as that would change the way filenames are encrypted.
- *
- * Return: 0 on success (dummy context set, or the same context is already set);
- * -EEXIST if a different dummy context is already set;
+ * Return: 0 on success (dummy policy set, or the same policy is already set);
+ * -EEXIST if a different dummy policy is already set;
* or another -errno value.
*/
-int fscrypt_set_test_dummy_encryption(struct super_block *sb,
- const substring_t *arg,
- struct fscrypt_dummy_context *dummy_ctx)
+int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
+ struct fscrypt_dummy_policy *dummy_policy)
{
- const char *argstr = "v2";
- const char *argstr_to_free = NULL;
struct fscrypt_key_specifier key_spec = { 0 };
int version;
- union fscrypt_context *ctx = NULL;
+ union fscrypt_policy *policy = NULL;
int err;
- if (arg->from) {
- argstr = argstr_to_free = match_strdup(arg);
- if (!argstr)
- return -ENOMEM;
- }
+ if (!arg)
+ arg = "v2";
- if (!strcmp(argstr, "v1")) {
- version = FSCRYPT_CONTEXT_V1;
+ if (!strcmp(arg, "v1")) {
+ version = FSCRYPT_POLICY_V1;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
memset(key_spec.u.descriptor, 0x42,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
- } else if (!strcmp(argstr, "v2")) {
- version = FSCRYPT_CONTEXT_V2;
+ } else if (!strcmp(arg, "v2")) {
+ version = FSCRYPT_POLICY_V2;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
/* key_spec.u.identifier gets filled in when adding the key */
} else {
@@ -707,21 +736,8 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
goto out;
}
- if (dummy_ctx->ctx) {
- /*
- * Note: if we ever make test_dummy_encryption support
- * specifying other encryption settings, such as the encryption
- * modes, we'll need to compare those settings here.
- */
- if (dummy_ctx->ctx->version == version)
- err = 0;
- else
- err = -EEXIST;
- goto out;
- }
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
+ policy = kzalloc(sizeof(*policy), GFP_KERNEL);
+ if (!policy) {
err = -ENOMEM;
goto out;
}
@@ -730,18 +746,18 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
if (err)
goto out;
- ctx->version = version;
- switch (ctx->version) {
- case FSCRYPT_CONTEXT_V1:
- ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor,
+ policy->version = version;
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
break;
- case FSCRYPT_CONTEXT_V2:
- ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier,
+ case FSCRYPT_POLICY_V2:
+ policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v2.master_key_identifier, key_spec.u.identifier,
FSCRYPT_KEY_IDENTIFIER_SIZE);
break;
default:
@@ -749,12 +765,19 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
err = -EINVAL;
goto out;
}
- dummy_ctx->ctx = ctx;
- ctx = NULL;
+
+ if (dummy_policy->policy) {
+ if (fscrypt_policies_equal(policy, dummy_policy->policy))
+ err = 0;
+ else
+ err = -EEXIST;
+ goto out;
+ }
+ dummy_policy->policy = policy;
+ policy = NULL;
err = 0;
out:
- kfree(ctx);
- kfree(argstr_to_free);
+ kfree(policy);
return err;
}
EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
@@ -771,10 +794,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
struct super_block *sb)
{
- const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb);
+ const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb);
+ int vers;
- if (!ctx)
+ if (!policy)
return;
- seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version);
+
+ vers = policy->version;
+ if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */
+ vers = 1;
+
+ seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers);
}
EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption);
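
[Editor's note: with the context → policy conversion above, a filesystem stores a struct fscrypt_dummy_policy and exposes it through the new ->get_dummy_policy() hook. A sketch of the wiring, assuming a hypothetical myfs_sb_info holding the dummy policy:]

static int myfs_handle_test_dummy_encryption(struct super_block *sb,
					     const char *arg /* may be NULL */)
{
	struct myfs_sb_info *sbi = sb->s_fs_info;	/* hypothetical */

	return fscrypt_set_test_dummy_encryption(sb, arg, &sbi->dummy_policy);
}

static const union fscrypt_policy *
myfs_get_dummy_policy(struct super_block *sb)
{
	struct myfs_sb_info *sbi = sb->s_fs_info;	/* hypothetical */

	return sbi->dummy_policy.policy;
}
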
diff --git a/fs/d_path.c b/fs/d_path.c
index 0f1fc1743302..a69e2cd36e6e 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -102,6 +102,8 @@ restart:
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
struct mount *parent = READ_ONCE(mnt->mnt_parent);
+ struct mnt_namespace *mnt_ns;
+
/* Escaped? */
if (dentry != vfsmnt->mnt_root) {
bptr = *buffer;
@@ -116,7 +118,9 @@ restart:
vfsmnt = &mnt->mnt;
continue;
}
- if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns))
+ mnt_ns = READ_ONCE(mnt->mnt_ns);
+ /* open-coded is_mounted() to use local mnt_ns */
+ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
error = 1; // absolute root
else
error = 2; // detached or not attached yet
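
[Editor's note: the d_path.c fix is an instance of a general lockless-reader rule: take one READ_ONCE() snapshot of a pointer that can change underneath you, then make every decision against that snapshot. A generic sketch; struct ns and ns_is_live() are hypothetical:]

struct conn {
	struct ns *ns;		/* may be updated concurrently */
};

/* racy: the two implied loads of c->ns may see different pointers */
static bool conn_live_racy(struct conn *c)
{
	return c->ns != NULL && ns_is_live(c->ns);
}

/* safe: both checks are made against the same snapshot */
static bool conn_live(struct conn *c)
{
	struct ns *ns = READ_ONCE(c->ns);

	return ns != NULL && ns_is_live(ns);
}
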
diff --git a/fs/dax.c b/fs/dax.c
index 994ab66a9907..5b47834f2e1b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -559,8 +559,11 @@ fallback:
}
/**
- * dax_layout_busy_page - find first pinned page in @mapping
+ * dax_layout_busy_page_range - find first pinned page in @mapping
* @mapping: address space to scan for a page with ref count > 1
+ * @start: Starting offset. Page containing 'start' is included.
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
+ * pages from 'start' to the end of the file are included.
*
* DAX requires ZONE_DEVICE mapped pages. These pages are never
* 'onlined' to the page allocator so they are considered idle when
@@ -573,12 +576,15 @@ fallback:
* to be able to run unmap_mapping_range() and subsequently not race
* mapping_mapped() becoming true.
*/
-struct page *dax_layout_busy_page(struct address_space *mapping)
+struct page *dax_layout_busy_page_range(struct address_space *mapping,
+ loff_t start, loff_t end)
{
- XA_STATE(xas, &mapping->i_pages, 0);
void *entry;
unsigned int scanned = 0;
struct page *page = NULL;
+ pgoff_t start_idx = start >> PAGE_SHIFT;
+ pgoff_t end_idx;
+ XA_STATE(xas, &mapping->i_pages, start_idx);
/*
* In the 'limited' case get_user_pages() for dax is disabled.
@@ -589,6 +595,11 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
+ /* If end == LLONG_MAX, all pages from start to the end of file */
+ if (end == LLONG_MAX)
+ end_idx = ULONG_MAX;
+ else
+ end_idx = end >> PAGE_SHIFT;
/*
* If we race get_user_pages_fast() here either we'll see the
* elevated page count in the iteration and wait, or
@@ -596,15 +607,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
* pte_lock() and pmd_lock(). New references are not taken without
- * holding those locks, and unmap_mapping_range() will not zero the
+ * holding those locks, and unmap_mapping_pages() will not zero the
* pte or pmd without holding the respective lock, so we are
* guaranteed to either see new references or prevent new
* references from being established.
*/
- unmap_mapping_range(mapping, 0, 0, 0);
+ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
xas_lock_irq(&xas);
- xas_for_each(&xas, entry, ULONG_MAX) {
+ xas_for_each(&xas, entry, end_idx) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
if (unlikely(dax_is_locked(entry)))
@@ -625,6 +636,12 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
xas_unlock_irq(&xas);
return page;
}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
+
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+ return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
+}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_entry(struct address_space *mapping,
@@ -1037,18 +1054,18 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}
-int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
- struct iomap *iomap)
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
pgoff_t pgoff;
long rc, id;
void *kaddr;
bool page_aligned = false;
-
+ unsigned offset = offset_in_page(pos);
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
- IS_ALIGNED(size, PAGE_SIZE))
+ (size == PAGE_SIZE))
page_aligned = true;
rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
@@ -1058,8 +1075,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
id = dax_read_lock();
if (page_aligned)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff,
- size >> PAGE_SHIFT);
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
@@ -1072,7 +1088,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
dax_flush(iomap->dax_dev, kaddr + offset, size);
}
dax_read_unlock(id);
- return 0;
+ return size;
}
static loff_t
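
[Editor's note: dax_layout_busy_page_range() lets a filesystem wait only for pinned pages in the range it is truncating or hole-punching, instead of scanning the whole mapping as dax_layout_busy_page() does. A sketch of a caller, with myfs_wait_dax_page() as a hypothetical waiter:]

static int myfs_break_dax_layouts(struct inode *inode, loff_t start,
				  loff_t end)
{
	struct page *page;

	/* returns the first page in [start, end] with an elevated refcount */
	page = dax_layout_busy_page_range(inode->i_mapping, start, end);
	if (!page)
		return 0;
	return myfs_wait_dax_page(inode, page);	/* hypothetical */
}
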
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 183299892465..d53fa92a1ab6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
-/**
- * dio_end_io - handle the end io action for the given bio
- * @bio: The direct io bio thats being completed
- *
- * This is meant to be called by any filesystem that uses their own dio_submit_t
- * so that the DIO specific endio actions are dealt with after the filesystem
- * has done it's completion work.
- */
-void dio_end_io(struct bio *bio)
-{
- struct dio *dio = bio->bi_private;
-
- if (dio->is_async)
- dio_bio_end_aio(bio);
- else
- dio_bio_end_io(bio);
-}
-EXPORT_SYMBOL_GPL(dio_end_io);
-
static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct block_device *bdev,
@@ -1165,22 +1146,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* the early prefetch in the caller enough time.
*/
- if (align & blocksize_mask) {
- if (bdev)
- blkbits = blksize_bits(bdev_logical_block_size(bdev));
- blocksize_mask = (1 << blkbits) - 1;
- if (align & blocksize_mask)
- goto out;
- }
-
/* watch out for a 0 len io from a tricksy fs */
if (iov_iter_rw(iter) == READ && !count)
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
- retval = -ENOMEM;
if (!dio)
- goto out;
+ return -ENOMEM;
/*
* Believe it or not, zeroing out the page array caused a .5%
* performance regression in a database benchmark. So, we take
@@ -1189,32 +1161,32 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
memset(dio, 0, offsetof(struct dio, pages));
dio->flags = flags;
- if (dio->flags & DIO_LOCKING) {
- if (iov_iter_rw(iter) == READ) {
- struct address_space *mapping =
- iocb->ki_filp->f_mapping;
-
- /* will be released by direct_io_worker */
- inode_lock(inode);
-
- retval = filemap_write_and_wait_range(mapping, offset,
- end - 1);
- if (retval) {
- inode_unlock(inode);
- kmem_cache_free(dio_cache, dio);
- goto out;
- }
- }
+ if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
+ /* will be released by direct_io_worker */
+ inode_lock(inode);
}
/* Once we sampled i_size check for reads beyond EOF */
dio->i_size = i_size_read(inode);
if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
- if (dio->flags & DIO_LOCKING)
- inode_unlock(inode);
- kmem_cache_free(dio_cache, dio);
retval = 0;
- goto out;
+ goto fail_dio;
+ }
+
+ if (align & blocksize_mask) {
+ if (bdev)
+ blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ blocksize_mask = (1 << blkbits) - 1;
+ if (align & blocksize_mask)
+ goto fail_dio;
+ }
+
+ if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+ if (retval)
+ goto fail_dio;
}
/*
@@ -1258,14 +1230,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
retval = sb_init_dio_done_wq(dio->inode->i_sb);
}
- if (retval) {
- /*
- * We grab i_mutex only for reads so we don't have
- * to release it here
- */
- kmem_cache_free(dio_cache, dio);
- goto out;
- }
+ if (retval)
+ goto fail_dio;
}
/*
@@ -1368,7 +1334,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else
BUG_ON(retval != -EIOCBQUEUED);
-out:
+ return retval;
+
+fail_dio:
+ if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ)
+ inode_unlock(inode);
+
+ kmem_cache_free(dio_cache, dio);
return retval;
}
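
[Editor's note: the direct-io.c restructuring replaces three duplicated unlock-and-free sequences with a single fail_dio label. This is the standard kernel unwind idiom; a generic sketch, with struct ctx and the take_lock()/do_io()/release_lock() helpers all hypothetical:]

struct ctx { int state; };			/* hypothetical */
static int take_lock(struct ctx *c);		/* hypothetical */
static void release_lock(struct ctx *c);	/* hypothetical */
static int do_io(struct ctx *c);		/* hypothetical */

static int do_setup(void)
{
	struct ctx *c;
	int err;

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c)
		return -ENOMEM;

	err = take_lock(c);
	if (err)
		goto fail_free;

	err = do_io(c);
	if (err)
		goto fail_unlock;

	release_lock(c);
	kfree(c);
	return 0;

fail_unlock:
	/* unwind in reverse order of setup */
	release_lock(c);
fail_free:
	kfree(c);
	return err;
}
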
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index f82a4952769d..ee92634196a8 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -4,6 +4,7 @@ menuconfig DLM
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
select IP_SCTP
+ select SRCU
help
A general purpose distributed lock manager for kernel or userspace
applications.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 47f0b98b707f..49c5f9407098 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item,
CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
- int *info_field, int check_zero,
+ int *info_field, bool (*check_cb)(unsigned int x),
const char *buf, size_t len)
{
unsigned int x;
@@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
if (rc)
return rc;
- if (check_zero && !x)
+ if (check_cb && check_cb(x))
return -EINVAL;
*cl_field = x;
@@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
return len;
}
-#define CLUSTER_ATTR(name, check_zero) \
+#define CLUSTER_ATTR(name, check_cb) \
static ssize_t cluster_##name##_store(struct config_item *item, \
const char *buf, size_t len) \
{ \
struct dlm_cluster *cl = config_item_to_cluster(item); \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
- check_zero, buf, len); \
+ check_cb, buf, len); \
} \
static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
@@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
} \
CONFIGFS_ATTR(cluster_, name);
-CLUSTER_ATTR(tcp_port, 1);
-CLUSTER_ATTR(buffer_size, 1);
-CLUSTER_ATTR(rsbtbl_size, 1);
-CLUSTER_ATTR(recover_timer, 1);
-CLUSTER_ATTR(toss_secs, 1);
-CLUSTER_ATTR(scan_secs, 1);
-CLUSTER_ATTR(log_debug, 0);
-CLUSTER_ATTR(log_info, 0);
-CLUSTER_ATTR(protocol, 0);
-CLUSTER_ATTR(mark, 0);
-CLUSTER_ATTR(timewarn_cs, 1);
-CLUSTER_ATTR(waitwarn_us, 0);
-CLUSTER_ATTR(new_rsb_count, 0);
-CLUSTER_ATTR(recover_callbacks, 0);
+static bool dlm_check_zero(unsigned int x)
+{
+ return !x;
+}
+
+static bool dlm_check_buffer_size(unsigned int x)
+{
+ return (x < DEFAULT_BUFFER_SIZE);
+}
+
+CLUSTER_ATTR(tcp_port, dlm_check_zero);
+CLUSTER_ATTR(buffer_size, dlm_check_buffer_size);
+CLUSTER_ATTR(rsbtbl_size, dlm_check_zero);
+CLUSTER_ATTR(recover_timer, dlm_check_zero);
+CLUSTER_ATTR(toss_secs, dlm_check_zero);
+CLUSTER_ATTR(scan_secs, dlm_check_zero);
+CLUSTER_ATTR(log_debug, NULL);
+CLUSTER_ATTR(log_info, NULL);
+CLUSTER_ATTR(protocol, NULL);
+CLUSTER_ATTR(mark, NULL);
+CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
+CLUSTER_ATTR(waitwarn_us, NULL);
+CLUSTER_ATTR(new_rsb_count, NULL);
+CLUSTER_ATTR(recover_callbacks, NULL);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
@@ -221,6 +231,7 @@ struct dlm_space {
struct list_head members;
struct mutex members_lock;
int members_count;
+ struct dlm_nodes *nds;
};
struct dlm_comms {
@@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
INIT_LIST_HEAD(&sp->members);
mutex_init(&sp->members_lock);
sp->members_count = 0;
+ sp->nds = nds;
return &sp->group;
fail:
@@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i)
static void release_space(struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
+ kfree(sp->nds);
kfree(sp);
}
@@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq)
return 0;
}
-int dlm_comm_mark(int nodeid, unsigned int *mark)
+void dlm_comm_mark(int nodeid, unsigned int *mark)
{
struct dlm_comm *cm;
cm = get_comm(nodeid);
- if (!cm)
- return -ENOENT;
+ if (!cm) {
+ *mark = dlm_config.ci_mark;
+ return;
+ }
- *mark = cm->mark;
- put_comm(cm);
+ if (cm->mark)
+ *mark = cm->mark;
+ else
+ *mark = dlm_config.ci_mark;
- return 0;
+ put_comm(cm);
}
int dlm_our_nodeid(void)
@@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
/* Config file defaults */
#define DEFAULT_TCP_PORT 21064
-#define DEFAULT_BUFFER_SIZE 4096
#define DEFAULT_RSBTBL_SIZE 1024
#define DEFAULT_RECOVER_TIMER 5
#define DEFAULT_TOSS_SECS 10
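
[Editor's note: replacing the check_zero flag with a predicate makes new constraints one-liners: the callback returns true when the written value must be rejected, in which case the store returns -EINVAL. A hypothetical example rejecting odd values for a made-up attribute:]

static bool dlm_check_even(unsigned int x)
{
	return (x & 1);		/* true == invalid */
}

CLUSTER_ATTR(some_even_field, dlm_check_even);	/* hypothetical attribute */
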
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index f62996cad561..c210250a2581 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -12,6 +12,8 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
+#define DEFAULT_BUFFER_SIZE 4096
+
struct dlm_config_node {
int nodeid;
int weight;
@@ -46,7 +48,7 @@ void dlm_config_exit(void);
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out);
int dlm_comm_seq(int nodeid, uint32_t *seq);
-int dlm_comm_mark(int nodeid, unsigned int *mark);
+void dlm_comm_mark(int nodeid, unsigned int *mark);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5050fe05769b..79f56f16bc2c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -65,40 +65,6 @@
#define MAX_SEND_MSG_COUNT 25
#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-struct cbuf {
- unsigned int base;
- unsigned int len;
- unsigned int mask;
-};
-
-static void cbuf_add(struct cbuf *cb, int n)
-{
- cb->len += n;
-}
-
-static int cbuf_data(struct cbuf *cb)
-{
- return ((cb->base + cb->len) & cb->mask);
-}
-
-static void cbuf_init(struct cbuf *cb, int size)
-{
- cb->base = cb->len = 0;
- cb->mask = size-1;
-}
-
-static void cbuf_eat(struct cbuf *cb, int n)
-{
- cb->len -= n;
- cb->base += n;
- cb->base &= cb->mask;
-}
-
-static bool cbuf_empty(struct cbuf *cb)
-{
- return cb->len == 0;
-}
-
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
@@ -117,8 +83,6 @@ struct connection {
int (*rx_action) (struct connection *); /* What to do when active */
void (*connect_action) (struct connection *); /* What to do to connect */
void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
- struct page *rx_page;
- struct cbuf cb;
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
@@ -126,6 +90,10 @@ struct connection {
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
+ unsigned char *rx_buf;
+ int rx_buflen;
+ int rx_leftover;
+ struct rcu_head rcu;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue;
static struct workqueue_struct *send_workqueue;
static struct hlist_head connection_hash[CONN_HASH_SIZE];
-static DEFINE_MUTEX(connections_lock);
-static struct kmem_cache *con_cache;
+static DEFINE_SPINLOCK(connections_lock);
+DEFINE_STATIC_SRCU(connections_srcu);
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
@@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid)
static struct connection *__find_con(int nodeid)
{
- int r;
+ int r, idx;
struct connection *con;
r = nodeid_hash(nodeid);
- hlist_for_each_entry(con, &connection_hash[r], list) {
- if (con->nodeid == nodeid)
+ idx = srcu_read_lock(&connections_srcu);
+ hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
+ if (con->nodeid == nodeid) {
+ srcu_read_unlock(&connections_srcu, idx);
return con;
+ }
}
+ srcu_read_unlock(&connections_srcu, idx);
+
return NULL;
}
@@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid)
* If 'allocation' is zero then we don't attempt to create a new
* connection structure for this node.
*/
-static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
+static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
- struct connection *con = NULL;
+ struct connection *con, *tmp;
int r;
con = __find_con(nodeid);
if (con || !alloc)
return con;
- con = kmem_cache_zalloc(con_cache, alloc);
+ con = kzalloc(sizeof(*con), alloc);
if (!con)
return NULL;
- r = nodeid_hash(nodeid);
- hlist_add_head(&con->list, &connection_hash[r]);
+ con->rx_buflen = dlm_config.ci_buffer_size;
+ con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
+ if (!con->rx_buf) {
+ kfree(con);
+ return NULL;
+ }
con->nodeid = nodeid;
mutex_init(&con->sock_mutex);
@@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
con->rx_action = zerocon->rx_action;
}
+ r = nodeid_hash(nodeid);
+
+ spin_lock(&connections_lock);
+ /* Because multiple workqueues/threads call this function, it can
+ * race on multiple CPUs. Instead of locking the hot path
+ * __find_con(), we handle the rare case of a recently added node by
+ * checking again under connections_lock. If the connection already
+ * exists, we abort our creation and return the existing connection.
+ */
+ tmp = __find_con(nodeid);
+ if (tmp) {
+ spin_unlock(&connections_lock);
+ kfree(con->rx_buf);
+ kfree(con);
+ return tmp;
+ }
+
+ hlist_add_head_rcu(&con->list, &connection_hash[r]);
+ spin_unlock(&connections_lock);
+
return con;
}
/* Loop round all connections */
static void foreach_conn(void (*conn_func)(struct connection *c))
{
- int i;
- struct hlist_node *n;
+ int i, idx;
struct connection *con;
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list)
conn_func(con);
}
-}
-
-static struct connection *nodeid2con(int nodeid, gfp_t allocation)
-{
- struct connection *con;
-
- mutex_lock(&connections_lock);
- con = __nodeid2con(nodeid, allocation);
- mutex_unlock(&connections_lock);
-
- return con;
+ srcu_read_unlock(&connections_srcu, idx);
}
static struct dlm_node_addr *find_node_addr(int nodeid)
@@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other,
/* Will only re-enter once. */
close_connection(con->othercon, false, true, true);
}
- if (con->rx_page) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
- }
+ con->rx_leftover = 0;
con->retries = 0;
mutex_unlock(&con->sock_mutex);
clear_bit(CF_CLOSING, &con->flags);
@@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con)
shutdown_connection(con);
}
+static int con_realloc_receive_buf(struct connection *con, int newlen)
+{
+ unsigned char *newbuf;
+
+ newbuf = kmalloc(newlen, GFP_NOFS);
+ if (!newbuf)
+ return -ENOMEM;
+
+ /* copy any leftover from last receive */
+ if (con->rx_leftover)
+ memmove(newbuf, con->rx_buf, con->rx_leftover);
+
+ /* swap to new buffer space */
+ kfree(con->rx_buf);
+ con->rx_buflen = newlen;
+ con->rx_buf = newbuf;
+
+ return 0;
+}
+
/* Data received from remote end */
static int receive_from_sock(struct connection *con)
{
- int ret = 0;
- struct msghdr msg = {};
- struct kvec iov[2];
- unsigned len;
- int r;
int call_again_soon = 0;
- int nvec;
+ struct msghdr msg;
+ struct kvec iov;
+ int ret, buflen;
mutex_lock(&con->sock_mutex);
@@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con)
ret = -EAGAIN;
goto out_close;
}
+
if (con->nodeid == 0) {
ret = -EINVAL;
goto out_close;
}
- if (con->rx_page == NULL) {
- /*
- * This doesn't need to be atomic, but I think it should
- * improve performance if it is.
- */
- con->rx_page = alloc_page(GFP_ATOMIC);
- if (con->rx_page == NULL)
+ /* reallocate the receive buffer if the configured size has changed */
+ buflen = dlm_config.ci_buffer_size;
+ if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
+ ret = con_realloc_receive_buf(con, buflen);
+ if (ret < 0)
goto out_resched;
- cbuf_init(&con->cb, PAGE_SIZE);
}
- /*
- * iov[0] is the bit of the circular buffer between the current end
- * point (cb.base + cb.len) and the end of the buffer.
- */
- iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
- iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
- iov[1].iov_len = 0;
- nvec = 1;
-
- /*
- * iov[1] is the bit of the circular buffer between the start of the
- * buffer and the start of the currently used section (cb.base)
+ /* calculate the receive parameters, accounting for any leftover
+ * bytes from the last receive
*/
- if (cbuf_data(&con->cb) >= con->cb.base) {
- iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb);
- iov[1].iov_len = con->cb.base;
- iov[1].iov_base = page_address(con->rx_page);
- nvec = 2;
- }
- len = iov[0].iov_len + iov[1].iov_len;
- iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len);
+ iov.iov_base = con->rx_buf + con->rx_leftover;
+ iov.iov_len = con->rx_buflen - con->rx_leftover;
- r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL);
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
if (ret <= 0)
goto out_close;
- else if (ret == len)
+ else if (ret == iov.iov_len)
call_again_soon = 1;
- cbuf_add(&con->cb, ret);
- ret = dlm_process_incoming_buffer(con->nodeid,
- page_address(con->rx_page),
- con->cb.base, con->cb.len,
- PAGE_SIZE);
- if (ret < 0) {
- log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d",
- ret, page_address(con->rx_page), con->cb.base,
- con->cb.len, r);
- cbuf_eat(&con->cb, r);
- } else {
- cbuf_eat(&con->cb, ret);
- }
+ /* new buflen: bytes just read plus the leftover from the last receive */
+ buflen = ret + con->rx_leftover;
+ ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
+ if (ret < 0)
+ goto out_close;
- if (cbuf_empty(&con->cb) && !call_again_soon) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
+ /* move any bytes left over after processing to the beginning of
+ * the receive buffer, so that the next receive completes the
+ * message at the start address of the buffer.
+ */
+ con->rx_leftover = buflen - ret;
+ if (con->rx_leftover) {
+ memmove(con->rx_buf, con->rx_buf + ret,
+ con->rx_leftover);
+ call_again_soon = true;
}
if (call_again_soon)
goto out_resched;
+
mutex_unlock(&con->sock_mutex);
return 0;
@@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con)
int nodeid;
struct connection *newcon;
struct connection *addcon;
+ unsigned int mark;
- mutex_lock(&connections_lock);
if (!dlm_allow_conn) {
- mutex_unlock(&connections_lock);
return -1;
}
- mutex_unlock(&connections_lock);
mutex_lock_nested(&con->sock_mutex, 0);
@@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con)
return -1;
}
+ dlm_comm_mark(nodeid, &mark);
+ sock_set_mark(newsock->sk, mark);
+
log_print("got connection from %d", nodeid);
/* Check to see if we already have a connection to this node. This
@@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con)
struct connection *othercon = newcon->othercon;
if (!othercon) {
- othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
+ othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
if (!othercon) {
log_print("failed to allocate incoming socket");
mutex_unlock(&newcon->sock_mutex);
result = -ENOMEM;
goto accept_err;
}
+
+ othercon->rx_buflen = dlm_config.ci_buffer_size;
+ othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS);
+ if (!othercon->rx_buf) {
+ mutex_unlock(&newcon->sock_mutex);
+ kfree(othercon);
+ log_print("failed to allocate incoming socket receive buffer");
+ result = -ENOMEM;
+ goto accept_err;
+ }
+
othercon->nodeid = nodeid;
othercon->rx_action = receive_from_sock;
mutex_init(&othercon->sock_mutex);
@@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
/* Some odd races can cause double-connects, ignore them */
@@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con)
if (result < 0)
goto socket_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto bind_err;
-
sock_set_mark(sock->sk, mark);
con->rx_action = receive_from_sock;
@@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
if (con->retries++ > MAX_CONNECT_RETRIES)
goto out;
@@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con)
if (result < 0)
goto out_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto out_err;
-
sock_set_mark(sock->sk, mark);
memset(&saddr, 0, sizeof(saddr));
@@ -1238,6 +1229,14 @@ static void init_local(void)
}
}
+static void deinit_local(void)
+{
+ int i;
+
+ for (i = 0; i < dlm_local_count; i++)
+ kfree(dlm_local_addr[i]);
+}
+
/* Initialise SCTP socket and bind to all interfaces */
static int sctp_listen_for_all(void)
{
@@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work)
send_to_sock(con);
}
-
-/* Discard all entries on the write queues */
-static void clean_writequeues(void)
-{
- foreach_conn(clean_one_writequeue);
-}
-
static void work_stop(void)
{
if (recv_workqueue)
@@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con)
con->shutdown_action(con);
}
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ kfree(con->rx_buf);
+ kfree(con);
+}
+
static void free_conn(struct connection *con)
{
close_connection(con, true, true, true);
- if (con->othercon)
- kmem_cache_free(con_cache, con->othercon);
- hlist_del(&con->list);
- kmem_cache_free(con_cache, con);
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
+ call_rcu(&con->othercon->rcu, connection_release);
+ }
+ clean_one_writequeue(con);
+ call_rcu(&con->rcu, connection_release);
}
static void work_flush(void)
{
- int ok;
+ int ok, idx;
int i;
- struct hlist_node *n;
struct connection *con;
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
do {
ok = 1;
foreach_conn(stop_conn);
@@ -1635,9 +1635,10 @@ static void work_flush(void)
flush_workqueue(recv_workqueue);
if (send_workqueue)
flush_workqueue(send_workqueue);
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
- hlist_for_each_entry_safe(con, n,
- &connection_hash[i], list) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i],
+ list) {
ok &= test_bit(CF_READ_PENDING, &con->flags);
ok &= test_bit(CF_WRITE_PENDING, &con->flags);
if (con->othercon) {
@@ -1648,6 +1649,7 @@ static void work_flush(void)
}
}
}
+ srcu_read_unlock(&connections_srcu, idx);
} while (!ok);
}
@@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void)
/* Set all the flags to prevent any
socket activity.
*/
- mutex_lock(&connections_lock);
dlm_allow_conn = 0;
- mutex_unlock(&connections_lock);
+
+ if (recv_workqueue)
+ flush_workqueue(recv_workqueue);
+ if (send_workqueue)
+ flush_workqueue(send_workqueue);
+
foreach_conn(shutdown_conn);
work_flush();
- clean_writequeues();
foreach_conn(free_conn);
work_stop();
-
- kmem_cache_destroy(con_cache);
+ deinit_local();
}
int dlm_lowcomms_start(void)
@@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void)
goto fail;
}
- error = -ENOMEM;
- con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
- __alignof__(struct connection), 0,
- NULL);
- if (!con_cache)
- goto fail;
-
error = work_start();
if (error)
- goto fail_destroy;
+ goto fail;
dlm_allow_conn = 1;
@@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void)
fail_unlisten:
dlm_allow_conn = 0;
con = nodeid2con(0,0);
- if (con) {
- close_connection(con, false, true, true);
- kmem_cache_free(con_cache, con);
- }
-fail_destroy:
- kmem_cache_destroy(con_cache);
+ if (con)
+ free_conn(con);
fail:
return error;
}
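
[Editor's note: the new lowcomms receive path boils down to: read into rx_buf at offset rx_leftover, hand the whole buffer to the parser, then memmove() the unconsumed tail to the front. A self-contained user-space model of that loop, where parse() stands in for dlm_process_incoming_buffer(); a real implementation would also grow the buffer as con_realloc_receive_buf() does:]

#include <string.h>
#include <unistd.h>

struct rxbuf {
	unsigned char buf[4096];
	int leftover;		/* bytes of a partial message kept at front */
};

/* parse() consumes complete messages, returns bytes consumed or < 0 */
static int pump(int fd, struct rxbuf *rx,
		int (*parse)(unsigned char *, int))
{
	ssize_t n;
	int have, used;

	n = read(fd, rx->buf + rx->leftover,
		 sizeof(rx->buf) - rx->leftover);
	if (n <= 0)
		return (int)n;	/* EOF or error */

	have = rx->leftover + (int)n;
	used = parse(rx->buf, have);
	if (used < 0)
		return used;	/* corrupt stream */

	/* keep the partial tail at the front for the next read */
	rx->leftover = have - used;
	memmove(rx->buf, rx->buf + used, rx->leftover);
	return used;
}
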
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 921322d133e3..fde3a6afe4be 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -22,114 +22,84 @@
* into packets and sends them to the comms layer.
*/
+#include <asm/unaligned.h>
+
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "lock.h"
#include "midcomms.h"
-
-static void copy_from_cb(void *dst, const void *base, unsigned offset,
- unsigned len, unsigned limit)
-{
- unsigned copy = len;
-
- if ((copy + offset) > limit)
- copy = limit - offset;
- memcpy(dst, base + offset, copy);
- len -= copy;
- if (len)
- memcpy(dst + copy, base, len);
-}
-
/*
* Called from the low-level comms layer to process a buffer of
* commands.
- *
- * Only complete messages are processed here, any "spare" bytes from
- * the end of a buffer are saved and tacked onto the front of the next
- * message that comes in. I doubt this will happen very often but we
- * need to be able to cope with it and I don't want the task to be waiting
- * for packets to come in when there is useful work to be done.
*/
-int dlm_process_incoming_buffer(int nodeid, const void *base,
- unsigned offset, unsigned len, unsigned limit)
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
{
- union {
- unsigned char __buf[DLM_INBUF_LEN];
- /* this is to force proper alignment on some arches */
- union dlm_packet p;
- } __tmp;
- union dlm_packet *p = &__tmp.p;
- int ret = 0;
- int err = 0;
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
uint16_t msglen;
- uint32_t lockspace;
-
- while (len > sizeof(struct dlm_header)) {
-
- /* Copy just the header to check the total length. The
- message may wrap around the end of the buffer back to the
- start, so we need to use a temp buffer and copy_from_cb. */
-
- copy_from_cb(p, base, offset, sizeof(struct dlm_header),
- limit);
-
- msglen = le16_to_cpu(p->header.h_length);
- lockspace = p->header.h_lockspace;
+ int ret = 0;
- err = -EINVAL;
- if (msglen < sizeof(struct dlm_header))
- break;
- if (p->header.h_cmd == DLM_MSG) {
- if (msglen < sizeof(struct dlm_message))
- break;
- } else {
- if (msglen < sizeof(struct dlm_rcom))
- break;
- }
- err = -E2BIG;
- if (msglen > dlm_config.ci_buffer_size) {
- log_print("message size %d from %d too big, buf len %d",
- msglen, nodeid, len);
- break;
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ /* no message may be larger than this, otherwise we
+ * cannot deliver it to the upper layers
+ */
+ msglen = get_unaligned_le16(&hd->h_length);
+ if (msglen > DEFAULT_BUFFER_SIZE) {
+ log_print("received invalid length header: %u, will abort message parsing",
+ msglen);
+ return -EBADMSG;
}
- err = 0;
-
- /* If only part of the full message is contained in this
- buffer, then do nothing and wait for lowcomms to call
- us again later with more data. We return 0 meaning
- we've consumed none of the input buffer. */
+ /* the caller will make sure that the leftover
+ * is parsed on the next call, with more data
+ */
if (msglen > len)
break;
- /* Allocate a larger temp buffer if the full message won't fit
- in the buffer on the stack (which should work for most
- ordinary messages). */
-
- if (msglen > sizeof(__tmp) && p == &__tmp.p) {
- p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
- if (p == NULL)
- return ret;
- }
+ switch (hd->h_cmd) {
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("dlm msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- copy_from_cb(p, base, offset, msglen, limit);
+ break;
+ case DLM_RCOM:
+ if (msglen < sizeof(struct dlm_rcom)) {
+ log_print("dlm rcom msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- BUG_ON(lockspace != p->header.h_lockspace);
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message",
+ hd->h_cmd);
+ goto skip;
+ }
+ /* for aligned memory access, copy the current message to the
+ * beginning of the buffer, whose already-parsed data has been
+ * consumed; since the buffer's start address is aligned, this
+ * gives the upper layers aligned access. This memmove can be
+ * removed once the upper layer can handle unaligned memory
+ * access.
+ */
+ memmove(buf, ptr, msglen);
+ dlm_receive_buffer((union dlm_packet *)buf, nodeid);
+
+skip:
ret += msglen;
- offset += msglen;
- offset &= (limit - 1);
len -= msglen;
-
- dlm_receive_buffer(p, nodeid);
+ ptr += msglen;
}
- if (p != &__tmp.p)
- kfree(p);
-
- return err ? err : ret;
+ return ret;
}
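
[Editor's note: dlm_process_incoming_buffer() above establishes a simple framing contract: consume complete length-prefixed records, abort on a corrupt length, and stop at a partial record while reporting the bytes consumed so the caller can preserve the tail. A self-contained model using a 2-byte host-endian length prefix; the 4096 cap mirrors DEFAULT_BUFFER_SIZE:]

#include <stdint.h>
#include <string.h>

static int parse_records(unsigned char *buf, int len)
{
	unsigned char *ptr = buf;
	int ret = 0;

	while (len >= 2) {
		uint16_t msglen;

		memcpy(&msglen, ptr, 2);	/* unaligned-safe length read */
		if (msglen < 2 || msglen > 4096)
			return -1;		/* corrupt stream: abort */
		if (msglen > len)
			break;			/* partial record: wait for more */

		/* deliver_record(ptr, msglen) would go here */
		ret += msglen;
		len -= msglen;
		ptr += msglen;
	}
	return ret;	/* bytes consumed; caller keeps the remaining tail */
}
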
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 2e122e81c8d0..61e90a921849 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -12,8 +12,7 @@
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__
-int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
- unsigned len, unsigned limit);
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
#endif /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e338c407cb75..67f68d48d60c 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -62,7 +62,7 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static const struct genl_ops dlm_nl_ops[] = {
+static const struct genl_small_ops dlm_nl_ops[] = {
{
.cmd = DLM_CMD_HELLO,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -73,8 +73,8 @@ static const struct genl_ops dlm_nl_ops[] = {
static struct genl_family family __ro_after_init = {
.name = DLM_GENL_NAME,
.version = DLM_GENL_VERSION,
- .ops = dlm_nl_ops,
- .n_ops = ARRAY_SIZE(dlm_nl_ops),
+ .small_ops = dlm_nl_ops,
+ .n_small_ops = ARRAY_SIZE(dlm_nl_ops),
.module = THIS_MODULE,
};
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 28bb5689333a..15880a68faad 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+ /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */
+ strreplace(name, '/', '!');
+
inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
is_removable);
if (!inode)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 459ecb42cbd3..347be146884c 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -224,7 +224,7 @@ submit_bio_retry:
bio_set_dev(bio, sb->s_bdev);
bio->bi_iter.bi_sector = (sector_t)blknr <<
LOG_SECTORS_PER_BLOCK;
- bio->bi_opf = REQ_OP_READ;
+ bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0);
}
err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ddaa516c008a..b9a09806512a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
enum {
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_acl,
- Opt_noacl,
Opt_cache_strategy,
Opt_err
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index c8c381eadcd6..5bde77d70852 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 6c939def00f9..50912a5420b4 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend {
struct z_erofs_collector clt;
struct erofs_map_blocks map;
+ bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
@@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+ enum z_erofs_cache_alloctype type)
{
const struct z_erofs_pcluster *pcl = clt->pcl;
const unsigned int clusterpages = BIT(pcl->clusterbits);
@@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page,
- struct list_head *pagepool)
+ struct page *page)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -620,8 +619,7 @@ restart_now:
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy);
hitted:
/*
@@ -653,7 +651,7 @@ retry:
/* should allocate an additional staging page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
- erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);
+ alloc_page(GFP_NOFS | __GFP_NOFAIL);
newpage->mapping = Z_EROFS_MAPPING_STAGING;
err = z_erofs_attach_page(clt, newpage,
@@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
}
static void z_erofs_submit_queue(struct super_block *sb,
- z_erofs_next_pcluster_t owned_head,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
@@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
+ z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
/* since bio will be NULL, no need to initialize last_index */
pgoff_t last_index;
unsigned int nr_bios = 0;
@@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb,
do {
struct page *page;
- int err;
page = pickup_page_for_submission(pcl, i++, pagepool,
MNGD_MAPPING(sbi),
@@ -1216,11 +1214,12 @@ submit_bio_retry:
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
bio->bi_opf = REQ_OP_READ;
+ if (f->readahead)
+ bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (err < PAGE_SIZE)
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
goto submit_bio_retry;
last_index = cur;
@@ -1248,14 +1247,14 @@ submit_bio_retry:
}
static void z_erofs_runqueue(struct super_block *sb,
- struct z_erofs_collector *clt,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);
+ z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page)
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, true);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
@@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page)
return err;
}
-static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
- unsigned int nr)
-{
- return nr <= sbi->ctx.max_sync_decompress_pages;
-}
-
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- bool sync = should_decompress_synchronously(sbi, readahead_count(rac));
+ unsigned int nr_pages = readahead_count(rac);
+ bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct page *page, *head = NULL;
LIST_HEAD(pagepool);
- trace_erofs_readpages(inode, readahead_index(rac),
- readahead_count(rac), false);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ f.readahead = true;
f.headoffset = readahead_pos(rac);
while ((page = readahead_page(rac))) {
@@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
@@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_collector_end(&f.clt);
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
if (f.map.mpage)
put_page(f.map.mpage);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8107e06d7f6f..4df61129566d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,8 +218,7 @@ struct eventpoll {
struct file *file;
/* used to optimize loop detection check */
- struct list_head visited_list_link;
- int visited;
+ u64 gen;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
@@ -274,6 +273,8 @@ static long max_user_watches __read_mostly;
*/
static DEFINE_MUTEX(epmutex);
+static u64 loop_check_gen = 0;
+
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;
@@ -283,9 +284,6 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
-/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
-static LIST_HEAD(visited_list);
-
/*
* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
@@ -1450,7 +1448,7 @@ static int reverse_path_check(void)
static int ep_create_wakeup_source(struct epitem *epi)
{
- const char *name;
+ struct name_snapshot n;
struct wakeup_source *ws;
if (!epi->ep->ws) {
@@ -1459,8 +1457,9 @@ static int ep_create_wakeup_source(struct epitem *epi)
return -ENOMEM;
}
- name = epi->ffd.file->f_path.dentry->d_name.name;
- ws = wakeup_source_register(NULL, name);
+ take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
+ ws = wakeup_source_register(NULL, n.name.name);
+ release_dentry_name_snapshot(&n);
if (!ws)
return -ENOMEM;
@@ -1522,6 +1521,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
RCU_INIT_POINTER(epi->ws, NULL);
}
+ /* Add the current item to the list of active epoll hook for this file */
+ spin_lock(&tfile->f_lock);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+ spin_unlock(&tfile->f_lock);
+
+ /*
+ * Add the current item to the RB tree. All RB tree operations are
+ * protected by "mtx", and ep_insert() is called with "mtx" held.
+ */
+ ep_rbtree_insert(ep, epi);
+
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (full_check && reverse_path_check())
+ goto error_remove_epi;
+
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
@@ -1544,22 +1559,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (epi->nwait < 0)
goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
- ep_rbtree_insert(ep, epi);
-
- /* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
-
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
@@ -1588,6 +1587,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return 0;
+error_unregister:
+ ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
@@ -1595,9 +1596,6 @@ error_remove_epi:
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-
/*
 * We need to do this because an event could have arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
@@ -1972,13 +1970,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1);
- ep->visited = 1;
- list_add(&ep->visited_list_link, &visited_list);
+ ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->visited)
+ if (ep_tovisit->gen == loop_check_gen)
continue;
error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
@@ -2019,18 +2016,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
*/
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
- int ret;
- struct eventpoll *ep_cur, *ep_next;
-
- ret = ep_call_nested(&poll_loop_ncalls,
+ return ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
- /* clear visited list */
- list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
- visited_list_link) {
- ep_cur->visited = 0;
- list_del(&ep_cur->visited_list_link);
- }
- return ret;
}
static void clear_tfile_check_list(void)
@@ -2195,11 +2182,13 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
+ ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
+ loop_check_gen++;
full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
@@ -2263,6 +2252,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
error_tgt_fput:
if (full_check) {
clear_tfile_check_list();
+ loop_check_gen++;
mutex_unlock(&epmutex);
}
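
The eventpoll change swaps a visited list (mark each node, then walk the
list to unmark) for a generation counter: bumping loop_check_gen once
invalidates every earlier mark, so there is nothing to clean up and no
list to maintain. The idiom in isolation, with kernel-style types and
hypothetical node names:

    struct node {
            u64 gen;                    /* last generation that visited us */
    };

    static u64 check_gen;               /* guarded by the same lock as the walk */

    static bool mark_visited(struct node *n)
    {
            if (n->gen == check_gen)
                    return false;       /* already seen in this check */
            n->gen = check_gen;
            return true;
    }

    static void begin_check(void)
    {
            check_gen++;                /* O(1) reset of every mark */
    }
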
diff --git a/fs/exec.c b/fs/exec.c
index a91003e28eaa..547a2390baf5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -23,6 +23,7 @@
* formats.
*/
+#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
@@ -62,6 +63,7 @@
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
+#include <linux/io_uring.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -949,137 +951,6 @@ struct file *open_exec(const char *name)
}
EXPORT_SYMBOL(open_exec);
-int kernel_read_file(struct file *file, void **buf, loff_t *size,
- loff_t max_size, enum kernel_read_file_id id)
-{
- loff_t i_size, pos;
- ssize_t bytes = 0;
- int ret;
-
- if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
- return -EINVAL;
-
- ret = deny_write_access(file);
- if (ret)
- return ret;
-
- ret = security_kernel_read_file(file, id);
- if (ret)
- goto out;
-
- i_size = i_size_read(file_inode(file));
- if (i_size <= 0) {
- ret = -EINVAL;
- goto out;
- }
- if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
- ret = -EFBIG;
- goto out;
- }
-
- if (id != READING_FIRMWARE_PREALLOC_BUFFER)
- *buf = vmalloc(i_size);
- if (!*buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < i_size) {
- bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
- if (bytes < 0) {
- ret = bytes;
- goto out_free;
- }
-
- if (bytes == 0)
- break;
- }
-
- if (pos != i_size) {
- ret = -EIO;
- goto out_free;
- }
-
- ret = security_kernel_post_read_file(file, *buf, i_size, id);
- if (!ret)
- *size = pos;
-
-out_free:
- if (ret < 0) {
- if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
- vfree(*buf);
- *buf = NULL;
- }
- }
-
-out:
- allow_write_access(file);
- return ret;
-}
-EXPORT_SYMBOL_GPL(kernel_read_file);
-
-int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
- loff_t max_size, enum kernel_read_file_id id)
-{
- struct file *file;
- int ret;
-
- if (!path || !*path)
- return -EINVAL;
-
- file = filp_open(path, O_RDONLY, 0);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- ret = kernel_read_file(file, buf, size, max_size, id);
- fput(file);
- return ret;
-}
-EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
-
-int kernel_read_file_from_path_initns(const char *path, void **buf,
- loff_t *size, loff_t max_size,
- enum kernel_read_file_id id)
-{
- struct file *file;
- struct path root;
- int ret;
-
- if (!path || !*path)
- return -EINVAL;
-
- task_lock(&init_task);
- get_fs_root(init_task.fs, &root);
- task_unlock(&init_task);
-
- file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0);
- path_put(&root);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- ret = kernel_read_file(file, buf, size, max_size, id);
- fput(file);
- return ret;
-}
-EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
-
-int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
- enum kernel_read_file_id id)
-{
- struct fd f = fdget(fd);
- int ret = -EBADF;
-
- if (!f.file)
- goto out;
-
- ret = kernel_read_file(f.file, buf, size, max_size, id);
-out:
- fdput(f);
- return ret;
-}
-EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
-
#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
defined(CONFIG_BINFMT_ELF_FDPIC)
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
@@ -1130,11 +1001,24 @@ static int exec_mmap(struct mm_struct *mm)
}
task_lock(tsk);
- active_mm = tsk->active_mm;
membarrier_exec_mmap(mm);
- tsk->mm = mm;
+
+ local_irq_disable();
+ active_mm = tsk->active_mm;
tsk->active_mm = mm;
+ tsk->mm = mm;
+ /*
+ * Disabling irqs prevents preemption while active_mm and mm are
+ * being updated; a context switch in that window could confuse
+ * lazy tlb mm refcounting. Not all architectures can handle
+ * irqs off over activate_mm() yet.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
activate_mm(active_mm, mm);
+ if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
@@ -1895,6 +1779,11 @@ static int bprm_execve(struct linux_binprm *bprm,
struct files_struct *displaced;
int retval;
+ /*
+ * Cancel any io_uring activity across execve
+ */
+ io_uring_task_cancel();
+
retval = unshare_files(&displaced);
if (retval)
return retval;
diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c
index 03d0824fc368..5a2f119b7e8c 100644
--- a/fs/exfat/cache.c
+++ b/fs/exfat/cache.c
@@ -17,7 +17,6 @@
#include "exfat_raw.h"
#include "exfat_fs.h"
-#define EXFAT_CACHE_VALID 0
#define EXFAT_MAX_CACHE 16
struct exfat_cache {
@@ -61,16 +60,6 @@ void exfat_cache_shutdown(void)
kmem_cache_destroy(exfat_cachep);
}
-void exfat_cache_init_inode(struct inode *inode)
-{
- struct exfat_inode_info *ei = EXFAT_I(inode);
-
- spin_lock_init(&ei->cache_lru_lock);
- ei->nr_caches = 0;
- ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
- INIT_LIST_HEAD(&ei->cache_lru);
-}
-
static inline struct exfat_cache *exfat_cache_alloc(void)
{
return kmem_cache_alloc(exfat_cachep, GFP_NOFS);
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 573659bfbc55..916797077aad 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -59,9 +59,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
}
/* read a directory entry from the opened directory */
-static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
+static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry)
{
- int i, dentries_per_clu, dentries_per_clu_bits = 0;
+ int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext;
unsigned int type, clu_offset;
sector_t sector;
struct exfat_chain dir, clu;
@@ -70,7 +70,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- unsigned int dentry = ei->rwoffset & 0xFFFFFFFF;
+ unsigned int dentry = EXFAT_B_TO_DEN(*cpos) & 0xFFFFFFFF;
struct buffer_head *bh;
/* check if the given file ID is opened */
@@ -127,6 +127,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
continue;
}
+ num_ext = ep->dentry.file.num_ext;
dir_entry->attr = le16_to_cpu(ep->dentry.file.attr);
exfat_get_entry_time(sbi, &dir_entry->crtime,
ep->dentry.file.create_tz,
@@ -157,12 +158,13 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
return -EIO;
dir_entry->size =
le64_to_cpu(ep->dentry.stream.valid_size);
+ dir_entry->entry = dentry;
brelse(bh);
ei->hint_bmap.off = dentry >> dentries_per_clu_bits;
ei->hint_bmap.clu = clu.dir;
- ei->rwoffset = ++dentry;
+ *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext);
return 0;
}
@@ -178,7 +180,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
}
dir_entry->namebuf.lfn[0] = '\0';
- ei->rwoffset = dentry;
+ *cpos = EXFAT_DEN_TO_B(dentry);
return 0;
}
@@ -242,12 +244,10 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx)
if (err)
goto unlock;
get_new:
- ei->rwoffset = EXFAT_B_TO_DEN(cpos);
-
if (cpos >= i_size_read(inode))
goto end_of_dir;
- err = exfat_readdir(inode, &de);
+ err = exfat_readdir(inode, &cpos, &de);
if (err) {
/*
* At least we tried to read a sector. Move cpos to next sector
@@ -262,13 +262,10 @@ get_new:
goto end_of_dir;
}
- cpos = EXFAT_DEN_TO_B(ei->rwoffset);
-
if (!nb->lfn[0])
goto end_of_dir;
- i_pos = ((loff_t)ei->start_clu << 32) |
- ((ei->rwoffset - 1) & 0xffffffff);
+ i_pos = ((loff_t)ei->start_clu << 32) | (de.entry & 0xffffffff);
tmp = exfat_iget(sb, i_pos);
if (tmp) {
inum = tmp->i_ino;
@@ -911,7 +908,6 @@ enum {
/*
* return values:
 * >= 0 : return dir entry position with the name in dir
- * -EEXIST : (root dir, ".") it is the root dir itself
* -ENOENT : entry with the name does not exist
* -EIO : I/O error
*/
@@ -979,11 +975,8 @@ rewind:
if (ei->hint_femp.eidx ==
EXFAT_HINT_NONE ||
candi_empty.eidx <=
- ei->hint_femp.eidx) {
- memcpy(&ei->hint_femp,
- &candi_empty,
- sizeof(candi_empty));
- }
+ ei->hint_femp.eidx)
+ ei->hint_femp = candi_empty;
}
brelse(bh);
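
The readdir rework keys everything off cpos, the byte offset into the
directory, converting to and from dentry indexes at the edges; advancing
by dentry + 1 + num_ext skips the file entry plus all of its extension
entries in one step. Assuming the usual 32-byte exFAT directory entry,
the conversion macros reduce to shifts (definitions as in
fs/exfat/exfat_fs.h; verify against your tree):

    #define DENTRY_SIZE_BITS  5                     /* 32-byte entries */
    #define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS)
    #define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS)
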
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 95d717f8620c..b8f0e829ecbd 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -128,7 +128,7 @@ enum {
struct exfat_dentry_namebuf {
char *lfn;
- int lfnbuf_len; /* usally MAX_UNINAME_BUF_SIZE */
+ int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */
};
/* unicode name structure */
@@ -248,6 +248,8 @@ struct exfat_sb_info {
struct rcu_head rcu;
};
+#define EXFAT_CACHE_VALID 0
+
/*
* EXFAT file system inode in-memory data
*/
@@ -263,8 +265,6 @@ struct exfat_inode_info {
* the validation of hint_stat.
*/
unsigned int version;
- /* file offset or dentry index for readdir */
- loff_t rwoffset;
/* hint for cluster last accessed */
struct exfat_hint hint_bmap;
@@ -428,7 +428,6 @@ extern const struct dentry_operations exfat_utf8_dentry_ops;
/* cache.c */
int exfat_cache_init(void);
void exfat_cache_shutdown(void);
-void exfat_cache_init_inode(struct inode *inode);
void exfat_cache_inval_inode(struct inode *inode);
int exfat_get_cluster(struct inode *inode, unsigned int cluster,
unsigned int *fclus, unsigned int *dclus,
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index f41f523a58ad..a92478eabfa4 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -208,8 +208,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
/* hint information */
ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
ei->hint_bmap.clu = EXFAT_EOF_CLUSTER;
- if (ei->rwoffset > new_size)
- ei->rwoffset = new_size;
/* hint_stat will be used if this is directory. */
ei->hint_stat.eidx = 0;
@@ -229,7 +227,7 @@ void exfat_truncate(struct inode *inode, loff_t size)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- unsigned int blocksize = 1 << inode->i_blkbits;
+ unsigned int blocksize = i_blocksize(inode);
loff_t aligned_size;
int err;
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 7f90204adef5..730373e0965a 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -114,8 +114,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
unsigned int local_clu_offset = clu_offset;
unsigned int num_to_be_allocated = 0, num_clusters = 0;
- ei->rwoffset = EXFAT_CLU_TO_B(clu_offset, sbi);
-
if (EXFAT_I(inode)->i_size_ondisk > 0)
num_clusters =
EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk,
@@ -556,7 +554,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
struct exfat_inode_info *ei = EXFAT_I(inode);
loff_t size = info->size;
- memcpy(&ei->dir, &info->dir, sizeof(struct exfat_chain));
+ ei->dir = info->dir;
ei->entry = info->entry;
ei->attr = info->attr;
ei->start_clu = info->start_clu;
@@ -567,7 +565,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
ei->hint_stat.eidx = 0;
ei->hint_stat.clu = info->start_clu;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
- ei->rwoffset = 0;
ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
ei->i_pos = 0;
@@ -611,8 +608,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
ei->i_crtime = info->crtime;
inode->i_atime = info->atime;
- exfat_cache_init_inode(inode);
-
return 0;
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index e73f20f66cb2..2932b23a3b6c 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -290,7 +290,7 @@ static int exfat_check_max_dentries(struct inode *inode)
{
if (EXFAT_B_TO_DEN(i_size_read(inode)) >= MAX_EXFAT_DENTRIES) {
/*
- * exFAT spec allows a dir to grow upto 8388608(256MB)
+ * exFAT spec allows a dir to grow up to 8388608(256MB)
* dentries
*/
return -ENOSPC;
@@ -318,8 +318,7 @@ static int exfat_find_empty_entry(struct inode *inode,
hint_femp.eidx = EXFAT_HINT_NONE;
if (ei->hint_femp.eidx != EXFAT_HINT_NONE) {
- memcpy(&hint_femp, &ei->hint_femp,
- sizeof(struct exfat_hint_femp));
+ hint_femp = ei->hint_femp;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
}
@@ -519,7 +518,7 @@ static int exfat_add_entry(struct inode *inode, const char *path,
if (ret)
goto out;
- memcpy(&info->dir, p_dir, sizeof(struct exfat_chain));
+ info->dir = *p_dir;
info->entry = dentry;
info->flags = ALLOC_NO_FAT_CHAIN;
info->type = type;
@@ -530,19 +529,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
info->size = 0;
info->num_subdirs = 0;
} else {
- int count;
- struct exfat_chain cdir;
-
info->attr = ATTR_SUBDIR;
info->start_clu = start_clu;
info->size = clu_size;
-
- exfat_chain_set(&cdir, info->start_clu,
- EXFAT_B_TO_CLU(info->size, sbi), info->flags);
- count = exfat_count_dir_entries(sb, &cdir);
- if (count < 0)
- return -EIO;
- info->num_subdirs = count + EXFAT_MIN_SUBDIR;
+ info->num_subdirs = EXFAT_MIN_SUBDIR;
}
memset(&info->crtime, 0, sizeof(info->crtime));
memset(&info->mtime, 0, sizeof(info->mtime));
@@ -578,7 +568,8 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode))
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
inode_inc_iversion(inode);
@@ -603,6 +594,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(dir);
+ struct exfat_dentry *ep, *ep2;
+ struct exfat_entry_set_cache *es;
if (qname->len == 0)
return -ENOENT;
@@ -628,91 +621,63 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name,
num_entries, TYPE_ALL);
- if ((dentry < 0) && (dentry != -EEXIST))
+ if (dentry < 0)
return dentry; /* -error value */
- memcpy(&info->dir, &cdir.dir, sizeof(struct exfat_chain));
+ info->dir = cdir;
info->entry = dentry;
info->num_subdirs = 0;
- /* root directory itself */
- if (unlikely(dentry == -EEXIST)) {
- int num_clu = 0;
+ es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
+ if (!es)
+ return -EIO;
+ ep = exfat_get_dentry_cached(es, 0);
+ ep2 = exfat_get_dentry_cached(es, 1);
+
+ info->type = exfat_get_entry_type(ep);
+ info->attr = le16_to_cpu(ep->dentry.file.attr);
+ info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
+ if ((info->type == TYPE_FILE) && (info->size == 0)) {
+ info->flags = ALLOC_NO_FAT_CHAIN;
+ info->start_clu = EXFAT_EOF_CLUSTER;
+ } else {
+ info->flags = ep2->dentry.stream.flags;
+ info->start_clu =
+ le32_to_cpu(ep2->dentry.stream.start_clu);
+ }
- info->type = TYPE_DIR;
- info->attr = ATTR_SUBDIR;
- info->flags = ALLOC_FAT_CHAIN;
- info->start_clu = sbi->root_dir;
- memset(&info->crtime, 0, sizeof(info->crtime));
- memset(&info->mtime, 0, sizeof(info->mtime));
- memset(&info->atime, 0, sizeof(info->atime));
-
- exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
- if (exfat_count_num_clusters(sb, &cdir, &num_clu))
- return -EIO;
- info->size = num_clu << sbi->cluster_size_bits;
+ exfat_get_entry_time(sbi, &info->crtime,
+ ep->dentry.file.create_tz,
+ ep->dentry.file.create_time,
+ ep->dentry.file.create_date,
+ ep->dentry.file.create_time_cs);
+ exfat_get_entry_time(sbi, &info->mtime,
+ ep->dentry.file.modify_tz,
+ ep->dentry.file.modify_time,
+ ep->dentry.file.modify_date,
+ ep->dentry.file.modify_time_cs);
+ exfat_get_entry_time(sbi, &info->atime,
+ ep->dentry.file.access_tz,
+ ep->dentry.file.access_time,
+ ep->dentry.file.access_date,
+ 0);
+ exfat_free_dentry_set(es, false);
+
+ if (ei->start_clu == EXFAT_FREE_CLUSTER) {
+ exfat_fs_error(sb,
+ "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
+ i_size_read(dir), ei->dir.dir, ei->entry);
+ return -EIO;
+ }
+ if (info->type == TYPE_DIR) {
+ exfat_chain_set(&cdir, info->start_clu,
+ EXFAT_B_TO_CLU(info->size, sbi), info->flags);
count = exfat_count_dir_entries(sb, &cdir);
if (count < 0)
return -EIO;
- info->num_subdirs = count;
- } else {
- struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es;
-
- es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
- if (!es)
- return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
-
- info->type = exfat_get_entry_type(ep);
- info->attr = le16_to_cpu(ep->dentry.file.attr);
- info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
- if ((info->type == TYPE_FILE) && (info->size == 0)) {
- info->flags = ALLOC_NO_FAT_CHAIN;
- info->start_clu = EXFAT_EOF_CLUSTER;
- } else {
- info->flags = ep2->dentry.stream.flags;
- info->start_clu =
- le32_to_cpu(ep2->dentry.stream.start_clu);
- }
-
- if (ei->start_clu == EXFAT_FREE_CLUSTER) {
- exfat_fs_error(sb,
- "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
- i_size_read(dir), ei->dir.dir, ei->entry);
- exfat_free_dentry_set(es, false);
- return -EIO;
- }
-
- exfat_get_entry_time(sbi, &info->crtime,
- ep->dentry.file.create_tz,
- ep->dentry.file.create_time,
- ep->dentry.file.create_date,
- ep->dentry.file.create_time_cs);
- exfat_get_entry_time(sbi, &info->mtime,
- ep->dentry.file.modify_tz,
- ep->dentry.file.modify_time,
- ep->dentry.file.modify_date,
- ep->dentry.file.modify_time_cs);
- exfat_get_entry_time(sbi, &info->atime,
- ep->dentry.file.access_tz,
- ep->dentry.file.access_time,
- ep->dentry.file.access_date,
- 0);
- exfat_free_dentry_set(es, false);
-
- if (info->type == TYPE_DIR) {
- exfat_chain_set(&cdir, info->start_clu,
- EXFAT_B_TO_CLU(info->size, sbi), info->flags);
- count = exfat_count_dir_entries(sb, &cdir);
- if (count < 0)
- return -EIO;
-
- info->num_subdirs = count + EXFAT_MIN_SUBDIR;
- }
+ info->num_subdirs = count + EXFAT_MIN_SUBDIR;
}
return 0;
}
@@ -745,10 +710,9 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
- }
i_mode = inode->i_mode;
alias = d_find_alias(inode);
@@ -890,10 +854,9 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
- }
inode_inc_iversion(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
@@ -1066,7 +1029,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
if (!epnew)
return -EIO;
- memcpy(epnew, epold, DENTRY_SIZE);
+ *epnew = *epold;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
@@ -1086,7 +1049,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
}
- memcpy(epnew, epold, DENTRY_SIZE);
+ *epnew = *epold;
exfat_update_bh(new_bh, sync);
brelse(old_bh);
brelse(new_bh);
@@ -1131,11 +1094,6 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
if (!epmov)
return -EIO;
- /* check if the source and target directory is the same */
- if (exfat_get_entry_type(epmov) == TYPE_DIR &&
- le32_to_cpu(epmov->dentry.stream.start_clu) == p_newdir->dir)
- return -EINVAL;
-
num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry,
epmov);
if (num_old_entries < 0)
@@ -1154,7 +1112,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
if (!epnew)
return -EIO;
- memcpy(epnew, epmov, DENTRY_SIZE);
+ *epnew = *epmov;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
@@ -1174,7 +1132,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
return -EIO;
}
- memcpy(epnew, epmov, DENTRY_SIZE);
+ *epnew = *epmov;
exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
brelse(new_bh);
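
The memcpy(epnew, epold, DENTRY_SIZE) calls above become plain struct
assignments; the compiler sizes and type-checks the copy, and the two
are equivalent here only because a struct exfat_dentry is exactly
DENTRY_SIZE bytes. The pattern in isolation, with a stand-in layout:

    struct dentry_stub {
            unsigned char raw[32];      /* stand-in for struct exfat_dentry */
    };

    static void copy_entry(struct dentry_stub *dst,
                           const struct dentry_stub *src)
    {
            *dst = *src;                /* same effect as memcpy(dst, src, 32) */
    }
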
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index a3c927501e67..675d0e7058c5 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -11,7 +11,7 @@
#include "exfat_raw.h"
#include "exfat_fs.h"
-/* Upcase tabel macro */
+/* Upcase table macro */
#define EXFAT_NUM_UPCASE (2918)
#define UTBL_COUNT (0x10000)
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 3b6a1659892f..3ffdce5c7384 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -342,7 +342,6 @@ static int exfat_read_root(struct inode *inode)
ei->flags = ALLOC_FAT_CHAIN;
ei->type = TYPE_DIR;
ei->version = 0;
- ei->rwoffset = 0;
ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
ei->hint_stat.eidx = 0;
ei->hint_stat.clu = sbi->root_dir;
@@ -376,7 +375,6 @@ static int exfat_read_root(struct inode *inode)
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
current_time(inode);
exfat_truncate_atime(&inode->i_atime);
- exfat_cache_init_inode(inode);
return 0;
}
@@ -763,6 +761,10 @@ static void exfat_inode_init_once(void *foo)
{
struct exfat_inode_info *ei = (struct exfat_inode_info *)foo;
+ spin_lock_init(&ei->cache_lru_lock);
+ ei->nr_caches = 0;
+ ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
+ INIT_LIST_HEAD(&ei->cache_lru);
INIT_HLIST_NODE(&ei->i_hash_fat);
inode_init_once(&ei->vfs_inode);
}
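
exfat_cache_init_inode() disappears because its fields now belong to the
slab constructor, which runs once when an object slot is first created
rather than on every allocation; the pattern is safe only if each object
is back in its constructed state before it is freed. The general shape,
with hypothetical names:

    struct my_inode_info {
            spinlock_t lock;
            struct list_head lru;
            struct inode vfs_inode;
    };

    static void my_inode_init_once(void *foo)
    {
            struct my_inode_info *ei = foo;

            spin_lock_init(&ei->lock);  /* must stay valid across reuse */
            INIT_LIST_HEAD(&ei->lru);
            inode_init_once(&ei->vfs_inode);
    }

    /* at cache creation:
     * cache = kmem_cache_create("my_inodes", sizeof(struct my_inode_info),
     *                           0, SLAB_RECLAIM_ACCOUNT, my_inode_init_once);
     */
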
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index fa9c951d3471..1f3f4326bf3c 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -189,7 +189,7 @@ static void group_adjust_blocks(struct super_block *sb, int group_no,
/**
* __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root: root of per-filesystem reservation rb tree
+ * @root: root of per-filesystem reservation rb tree
* @verbose: verbose mode
* @fn: function which wishes to dump the reservation map
*
@@ -282,7 +282,7 @@ goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal,
/**
* search_reserve_window()
- * @rb_root: root of reservation tree
+ * @root: root of reservation tree
* @goal: target allocation block
*
* Find the reserved window which includes the goal, or the previous one
@@ -859,7 +859,7 @@ static int find_next_reservable_window(
*
* failed: we failed to find a reservation window in this group
*
- * @rsv: the reservation
+ * @my_rsv: the reservation
*
* @grp_goal: The goal (group-relative). It is where the search for a
* free reservable space should start from.
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 415c21f0e750..11c5c6fe75bb 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -36,7 +36,6 @@
#include <linux/iomap.h>
#include <linux/namei.h>
#include <linux/uio.h>
-#include <linux/fiemap.h>
#include "ext2.h"
#include "acl.h"
#include "xattr.h"
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1d82336b1cd4..efe77cffc322 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -148,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
}
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 523e00d7b392..f9a692c0a66c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1401,7 +1401,7 @@ struct ext4_super_block {
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL)
+#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
#else
#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
@@ -1596,8 +1596,8 @@ struct ext4_sb_info {
atomic_t s_warning_count;
atomic_t s_msg_count;
- /* Encryption context for '-o test_dummy_encryption' */
- struct fscrypt_dummy_context s_dummy_enc_ctx;
+ /* Encryption policy for '-o test_dummy_encryption' */
+ struct fscrypt_dummy_policy s_dummy_enc_policy;
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df25d38d6539..698ca4a4db5f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -742,6 +742,53 @@ not_found:
return 1;
}
+static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
+ bool encrypt)
+{
+ struct super_block *sb = dir->i_sb;
+ int nblocks = 0;
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never more than 1k.
+ * In practice they are under 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */,
+ NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ return nblocks;
+}
+
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
@@ -772,7 +819,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp;
- int encrypt = 0;
+ bool encrypt = false;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -784,59 +831,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
- !(i_flags & EXT4_EA_INODE_FL)) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return ERR_PTR(err);
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-ENOKEY);
- encrypt = 1;
- }
-
- if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
-
- if (IS_ERR(p))
- return ERR_CAST(p);
- if (p) {
- int acl_size = p->a_count * sizeof(ext4_acl_entry);
-
- nblocks += (S_ISDIR(mode) ? 2 : 1) *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, acl_size,
- true /* is_create */);
- posix_acl_release(p);
- }
-#endif
-
-#ifdef CONFIG_SECURITY
- {
- int num_security_xattrs = 1;
-
-#ifdef CONFIG_INTEGRITY
- num_security_xattrs++;
-#endif
- /*
- * We assume that security xattrs are never
- * more than 1k. In practice they are under
- * 128 bytes.
- */
- nblocks += num_security_xattrs *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, 1024,
- true /* is_create */);
- }
-#endif
- if (encrypt)
- nblocks += __ext4_xattr_set_credits(sb,
- NULL /* inode */, NULL /* block_bh */,
- FSCRYPT_SET_CONTEXT_MAX_SIZE,
- true /* is_create */);
- }
-
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
@@ -866,10 +860,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
else
ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+ if (!(i_flags & EXT4_EA_INODE_FL)) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto out;
+ }
+
err = dquot_initialize(inode);
if (err)
goto out;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+ ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
+ if (ret2 < 0) {
+ err = ret2;
+ goto out;
+ }
+ nblocks += ret2;
+ }
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -1162,7 +1171,7 @@ got:
* prevent its deduplication.
*/
if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
+ err = fscrypt_set_context(inode, handle);
if (err)
goto fail_free_drop;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 153a9fbe1dd0..0d74615fcce3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -663,8 +663,7 @@ static struct stats dx_show_leaf(struct inode *dir,
/* Directory is encrypted */
res = fscrypt_fname_alloc_buffer(
- dir, len,
- &fname_crypto_str);
+ len, &fname_crypto_str);
if (res)
printk(KERN_WARNING "Error "
"allocating crypto "
@@ -1016,8 +1015,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
brelse(bh);
return err;
}
- err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN,
- &fname_crypto_str);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
+ &fname_crypto_str);
if (err < 0) {
brelse(bh);
return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ea425b49b345..8b2736283481 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1104,7 +1104,7 @@ static void ext4_put_super(struct super_block *sb)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
@@ -1392,10 +1392,9 @@ retry:
return res;
}
-static const union fscrypt_context *
-ext4_get_dummy_context(struct super_block *sb)
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
- return EXT4_SB(sb)->s_dummy_enc_ctx.ctx;
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}
static bool ext4_has_stable_inodes(struct super_block *sb)
@@ -1414,7 +1413,7 @@ static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
- .get_dummy_context = ext4_get_dummy_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
@@ -1888,12 +1887,13 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !sbi->s_dummy_enc_ctx.ctx) {
+ if (is_remount && !sbi->s_dummy_enc_policy.policy) {
ext4_msg(sb, KERN_WARNING,
"Can't set test_dummy_encryption on remount");
return -1;
}
- err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx);
+ err = fscrypt_set_test_dummy_encryption(sb, arg->from,
+ &sbi->s_dummy_enc_policy);
if (err) {
if (err == -EEXIST)
ext4_msg(sb, KERN_WARNING,
@@ -4935,7 +4935,7 @@ failed_mount:
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
ext4_blkdev_remove(sbi);
brelse(bh);
out_fail:
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index bbd5e7e0632b..5b7ba8f71153 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -349,6 +349,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
@@ -358,8 +359,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
if (page)
put_page(page);
else if (num_ra_pages > 1)
- page_cache_readahead_unbounded(inode->i_mapping, NULL,
- index, num_ra_pages, 0);
+ page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
return page;
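
page_cache_ra_unbounded() takes a readahead_control instead of the old
(mapping, file, index, ...) argument list, and DEFINE_READAHEAD builds
that control on the stack. In the 5.10-era API the macro expands roughly
as below (field names per include/linux/pagemap.h; check your tree):

    struct readahead_control ractl = {
            .file    = NULL,            /* no struct file for metadata reads */
            .mapping = inode->i_mapping,
            ._index  = index,
    };

    /* read num_ra_pages starting at ractl._index, ignoring i_size,
     * which is what a Merkle-tree region stored past EOF needs: */
    page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
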
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 217b290ae3a5..306413589827 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
return (void *)f2fs_acl;
fail:
- kvfree(f2fs_acl);
+ kfree(f2fs_acl);
return ERR_PTR(-EINVAL);
}
@@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
acl = NULL;
else
acl = ERR_PTR(retval);
- kvfree(value);
+ kfree(value);
return acl;
}
@@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
- kvfree(value);
+ kfree(value);
if (!error)
set_cached_acl(inode, type, acl);
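
These buffers are kmalloc-backed, so plain kfree() is the matching free;
kvfree() is only needed for kvmalloc() memory, which may fall back to
vmalloc. The pairing rule in brief:

    void *a = kmalloc(len, GFP_KERNEL);   /* slab only */
    /* ... */
    kfree(a);

    void *b = kvmalloc(len, GFP_KERNEL);  /* slab, may fall back to vmalloc */
    /* ... */
    kvfree(b);                            /* handles either backing store */
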
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ff807e14c891..023462e80e58 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -107,7 +107,7 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
return __get_meta_page(sbi, index, true);
}
-struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index)
+struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct page *page;
int count = 0;
@@ -243,6 +243,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
blkno * NAT_ENTRY_PER_BLOCK);
break;
case META_SIT:
+ if (unlikely(blkno >= TOTAL_SEGS(sbi)))
+ goto out;
/* get sit block addr */
fio.new_blkaddr = current_sit_addr(sbi,
blkno * SIT_ENTRY_PER_BLOCK);
@@ -1047,8 +1049,12 @@ int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
get_pages(sbi, is_dir ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
retry:
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
return -EIO;
+ }
spin_lock(&sbi->inode_lock[type]);
@@ -1619,11 +1625,16 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_sit_entries(sbi, cpc);
+ /* save inmem log status */
+ f2fs_save_inmem_curseg(sbi);
+
err = do_checkpoint(sbi, cpc);
if (err)
f2fs_release_discard_addrs(sbi);
else
f2fs_clear_prefree_segments(sbi, cpc);
+
+ f2fs_restore_inmem_curseg(sbi);
stop:
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
@@ -1654,7 +1665,7 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
}
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
- NR_CURSEG_TYPE - __cp_payload(sbi)) *
+ NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
F2FS_ORPHANS_PER_BLOCK;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 1dfb126a0cb2..14262e0f1cd6 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -17,6 +17,33 @@
#include "node.h"
#include <trace/events/f2fs.h>
+static struct kmem_cache *cic_entry_slab;
+static struct kmem_cache *dic_entry_slab;
+
+static void *page_array_alloc(struct inode *inode, int nr)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ unsigned int size = sizeof(struct page *) * nr;
+
+ if (likely(size <= sbi->page_array_slab_size))
+ return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS);
+ return f2fs_kzalloc(sbi, size, GFP_NOFS);
+}
+
+static void page_array_free(struct inode *inode, void *pages, int nr)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ unsigned int size = sizeof(struct page *) * nr;
+
+ if (!pages)
+ return;
+
+ if (likely(size <= sbi->page_array_slab_size))
+ kmem_cache_free(sbi->page_array_slab, pages);
+ else
+ kfree(pages);
+}
+
struct f2fs_compress_ops {
int (*init_compress_ctx)(struct compress_ctx *cc);
void (*destroy_compress_ctx)(struct compress_ctx *cc);
@@ -130,19 +157,16 @@ struct page *f2fs_compress_control_page(struct page *page)
int f2fs_init_compress_ctx(struct compress_ctx *cc)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
-
- if (cc->nr_rpages)
+ if (cc->rpages)
return 0;
- cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
- cc->log_cluster_size, GFP_NOFS);
+ cc->rpages = page_array_alloc(cc->inode, cc->cluster_size);
return cc->rpages ? 0 : -ENOMEM;
}
void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
{
- kfree(cc->rpages);
+ page_array_free(cc->inode, cc->rpages, cc->cluster_size);
cc->rpages = NULL;
cc->nr_rpages = 0;
cc->nr_cpages = 0;
@@ -382,16 +406,17 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
ZSTD_DStream *stream;
void *workspace;
unsigned int workspace_size;
+ unsigned int max_window_size =
+ MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size);
- workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE);
+ workspace_size = ZSTD_DStreamWorkspaceBound(max_window_size);
workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode),
workspace_size, GFP_NOFS);
if (!workspace)
return -ENOMEM;
- stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE,
- workspace, workspace_size);
+ stream = ZSTD_initDStream(max_window_size, workspace, workspace_size);
if (!stream) {
printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n",
KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
@@ -554,13 +579,29 @@ static void f2fs_compress_free_page(struct page *page)
mempool_free(page, compress_page_pool);
}
+#define MAX_VMAP_RETRIES 3
+
+static void *f2fs_vmap(struct page **pages, unsigned int count)
+{
+ int i;
+ void *buf = NULL;
+
+ for (i = 0; i < MAX_VMAP_RETRIES; i++) {
+ buf = vm_map_ram(pages, count, -1);
+ if (buf)
+ break;
+ vm_unmap_aliases();
+ }
+ return buf;
+}
+
static int f2fs_compress_pages(struct compress_ctx *cc)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
struct f2fs_inode_info *fi = F2FS_I(cc->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
- unsigned int max_len, nr_cpages;
+ unsigned int max_len, new_nr_cpages;
+ struct page **new_cpages;
int i, ret;
trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
@@ -575,8 +616,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
max_len = COMPRESS_HEADER_SIZE + cc->clen;
cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
- cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
- cc->nr_cpages, GFP_NOFS);
+ cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages);
if (!cc->cpages) {
ret = -ENOMEM;
goto destroy_compress_ctx;
@@ -590,13 +630,13 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
}
}
- cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO);
+ cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size);
if (!cc->rbuf) {
ret = -ENOMEM;
goto out_free_cpages;
}
- cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL);
+ cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages);
if (!cc->cbuf) {
ret = -ENOMEM;
goto out_vunmap_rbuf;
@@ -618,16 +658,28 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
cc->cbuf->reserved[i] = cpu_to_le32(0);
- nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
+ new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
+
+ /* Now we're going to cut unnecessary tail pages */
+ new_cpages = page_array_alloc(cc->inode, new_nr_cpages);
+ if (!new_cpages) {
+ ret = -ENOMEM;
+ goto out_vunmap_cbuf;
+ }
/* zero out any unused part of the last page */
memset(&cc->cbuf->cdata[cc->clen], 0,
- (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE));
+ (new_nr_cpages * PAGE_SIZE) -
+ (cc->clen + COMPRESS_HEADER_SIZE));
- vunmap(cc->cbuf);
- vunmap(cc->rbuf);
+ vm_unmap_ram(cc->cbuf, cc->nr_cpages);
+ vm_unmap_ram(cc->rbuf, cc->cluster_size);
- for (i = nr_cpages; i < cc->nr_cpages; i++) {
+ for (i = 0; i < cc->nr_cpages; i++) {
+ if (i < new_nr_cpages) {
+ new_cpages[i] = cc->cpages[i];
+ continue;
+ }
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
@@ -635,22 +687,24 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
if (cops->destroy_compress_ctx)
cops->destroy_compress_ctx(cc);
- cc->nr_cpages = nr_cpages;
+ page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ cc->cpages = new_cpages;
+ cc->nr_cpages = new_nr_cpages;
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
cc->clen, ret);
return 0;
out_vunmap_cbuf:
- vunmap(cc->cbuf);
+ vm_unmap_ram(cc->cbuf, cc->nr_cpages);
out_vunmap_rbuf:
- vunmap(cc->rbuf);
+ vm_unmap_ram(cc->rbuf, cc->cluster_size);
out_free_cpages:
for (i = 0; i < cc->nr_cpages; i++) {
if (cc->cpages[i])
f2fs_compress_free_page(cc->cpages[i]);
}
- kfree(cc->cpages);
+ page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
destroy_compress_ctx:
if (cops->destroy_compress_ctx)
@@ -677,7 +731,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
if (bio->bi_status || PageError(page))
dic->failed = true;
- if (refcount_dec_not_one(&dic->ref))
+ if (atomic_dec_return(&dic->pending_pages))
return;
trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
@@ -689,8 +743,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
goto out_free_dic;
}
- dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
- dic->cluster_size, GFP_NOFS);
+ dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
if (!dic->tpages) {
ret = -ENOMEM;
goto out_free_dic;
@@ -715,13 +768,13 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
goto out_free_dic;
}
- dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
+ dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
if (!dic->rbuf) {
ret = -ENOMEM;
goto destroy_decompress_ctx;
}
- dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO);
+ dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
if (!dic->cbuf) {
ret = -ENOMEM;
goto out_vunmap_rbuf;
@@ -738,15 +791,15 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
ret = cops->decompress_pages(dic);
out_vunmap_cbuf:
- vunmap(dic->cbuf);
+ vm_unmap_ram(dic->cbuf, dic->nr_cpages);
out_vunmap_rbuf:
- vunmap(dic->rbuf);
+ vm_unmap_ram(dic->rbuf, dic->cluster_size);
destroy_decompress_ctx:
if (cops->destroy_decompress_ctx)
cops->destroy_decompress_ctx(dic);
out_free_dic:
if (verity)
- refcount_set(&dic->ref, dic->nr_cpages);
+ atomic_set(&dic->pending_pages, dic->nr_cpages);
if (!verity)
f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
ret, false);
@@ -1029,6 +1082,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
{
struct compress_ctx cc = {
+ .inode = inode,
.log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
.cluster_size = F2FS_I(inode)->i_cluster_size,
.rpages = fsdata,
@@ -1132,7 +1186,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
*/
down_read(&sbi->node_write);
} else if (!f2fs_trylock_op(sbi)) {
- return -EAGAIN;
+ goto out_free;
}
set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
@@ -1155,15 +1209,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
fio.version = ni.version;
- cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS);
+ cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS);
if (!cic)
goto out_put_dnode;
cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
cic->inode = inode;
- refcount_set(&cic->ref, cc->nr_cpages);
- cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
- cc->log_cluster_size, GFP_NOFS);
+ atomic_set(&cic->pending_pages, cc->nr_cpages);
+ cic->rpages = page_array_alloc(cc->inode, cc->cluster_size);
if (!cic->rpages)
goto out_put_cic;
@@ -1257,11 +1310,13 @@ unlock_continue:
spin_unlock(&fi->i_size_lock);
f2fs_put_rpages(cc);
+ page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ cc->cpages = NULL;
f2fs_destroy_compress_ctx(cc);
return 0;
out_destroy_crypt:
- kfree(cic->rpages);
+ page_array_free(cc->inode, cic->rpages, cc->cluster_size);
for (--i; i >= 0; i--)
fscrypt_finalize_bounce_page(&cc->cpages[i]);
@@ -1271,7 +1326,7 @@ out_destroy_crypt:
f2fs_put_page(cc->cpages[i], 1);
}
out_put_cic:
- kfree(cic);
+ kmem_cache_free(cic_entry_slab, cic);
out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
@@ -1279,6 +1334,9 @@ out_unlock_op:
up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
+out_free:
+ page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ cc->cpages = NULL;
return -EAGAIN;
}
@@ -1296,7 +1354,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
dec_page_count(sbi, F2FS_WB_DATA);
- if (refcount_dec_not_one(&cic->ref))
+ if (atomic_dec_return(&cic->pending_pages))
return;
for (i = 0; i < cic->nr_rpages; i++) {
@@ -1305,8 +1363,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
end_page_writeback(cic->rpages[i]);
}
- kfree(cic->rpages);
- kfree(cic);
+ page_array_free(cic->inode, cic->rpages, cic->nr_rpages);
+ kmem_cache_free(cic_entry_slab, cic);
}
static int f2fs_write_raw_pages(struct compress_ctx *cc,
@@ -1388,9 +1446,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
struct writeback_control *wbc,
enum iostat_type io_type)
{
- struct f2fs_inode_info *fi = F2FS_I(cc->inode);
- const struct f2fs_compress_ops *cops =
- f2fs_cops[fi->i_compress_algorithm];
int err;
*submitted = 0;
@@ -1405,9 +1460,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
err = f2fs_write_compressed_pages(cc, submitted,
wbc, io_type);
- cops->destroy_compress_ctx(cc);
- kfree(cc->cpages);
- cc->cpages = NULL;
if (!err)
return 0;
f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN);
@@ -1424,25 +1476,23 @@ destroy_out:
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
struct decompress_io_ctx *dic;
pgoff_t start_idx = start_idx_of_cluster(cc);
int i;
- dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS);
+ dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS);
if (!dic)
return ERR_PTR(-ENOMEM);
- dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
- cc->log_cluster_size, GFP_NOFS);
+ dic->rpages = page_array_alloc(cc->inode, cc->cluster_size);
if (!dic->rpages) {
- kfree(dic);
+ kmem_cache_free(dic_entry_slab, dic);
return ERR_PTR(-ENOMEM);
}
dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
dic->inode = cc->inode;
- refcount_set(&dic->ref, cc->nr_cpages);
+ atomic_set(&dic->pending_pages, cc->nr_cpages);
dic->cluster_idx = cc->cluster_idx;
dic->cluster_size = cc->cluster_size;
dic->log_cluster_size = cc->log_cluster_size;
@@ -1453,8 +1503,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->rpages[i] = cc->rpages[i];
dic->nr_rpages = cc->cluster_size;
- dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
- dic->nr_cpages, GFP_NOFS);
+ dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages);
if (!dic->cpages)
goto out_free;
@@ -1489,7 +1538,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
continue;
f2fs_compress_free_page(dic->tpages[i]);
}
- kfree(dic->tpages);
+ page_array_free(dic->inode, dic->tpages, dic->cluster_size);
}
if (dic->cpages) {
@@ -1498,11 +1547,11 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
continue;
f2fs_compress_free_page(dic->cpages[i]);
}
- kfree(dic->cpages);
+ page_array_free(dic->inode, dic->cpages, dic->nr_cpages);
}
- kfree(dic->rpages);
- kfree(dic);
+ page_array_free(dic->inode, dic->rpages, dic->nr_rpages);
+ kmem_cache_free(dic_entry_slab, dic);
}
void f2fs_decompress_end_io(struct page **rpages,
@@ -1530,3 +1579,76 @@ unlock:
unlock_page(rpage);
}
}
+
+int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ char slab_name[32];
+
+ sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev));
+
+ sbi->page_array_slab_size = sizeof(struct page *) <<
+ F2FS_OPTION(sbi).compress_log_size;
+
+ sbi->page_array_slab = f2fs_kmem_cache_create(slab_name,
+ sbi->page_array_slab_size);
+ if (!sbi->page_array_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi)
+{
+ kmem_cache_destroy(sbi->page_array_slab);
+}
+
+static int __init f2fs_init_cic_cache(void)
+{
+ cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry",
+ sizeof(struct compress_io_ctx));
+ if (!cic_entry_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+static void f2fs_destroy_cic_cache(void)
+{
+ kmem_cache_destroy(cic_entry_slab);
+}
+
+static int __init f2fs_init_dic_cache(void)
+{
+ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry",
+ sizeof(struct decompress_io_ctx));
+ if (!dic_entry_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+static void f2fs_destroy_dic_cache(void)
+{
+ kmem_cache_destroy(dic_entry_slab);
+}
+
+int __init f2fs_init_compress_cache(void)
+{
+ int err;
+
+ err = f2fs_init_cic_cache();
+ if (err)
+ goto out;
+ err = f2fs_init_dic_cache();
+ if (err)
+ goto free_cic;
+ return 0;
+free_cic:
+ f2fs_destroy_cic_cache();
+out:
+ return -ENOMEM;
+}
+
+void f2fs_destroy_compress_cache(void)
+{
+ f2fs_destroy_dic_cache();
+ f2fs_destroy_cic_cache();
+}
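
The page_array_alloc()/page_array_free() helpers called by the hunks above are introduced elsewhere in compress.c and are not part of this excerpt. A minimal sketch of their expected shape, inferred from the call sites and from the per-superblock slab set up in f2fs_init_page_array_cache() above (the oversize fallback path and the GFP flags are assumptions, not confirmed by this diff):

    static void *page_array_alloc(struct inode *inode, int nr)
    {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        unsigned int size = sizeof(struct page *) * nr;

        /* cluster-sized arrays come from the preallocated per-sb slab */
        if (likely(size <= sbi->page_array_slab_size))
            return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS);
        /* oversized requests fall back to the generic allocator */
        return f2fs_kzalloc(sbi, size, GFP_NOFS);
    }

    static void page_array_free(struct inode *inode, void *pages, int nr)
    {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        unsigned int size = sizeof(struct page *) * nr;

        if (!pages)
            return;
        if (likely(size <= sbi->page_array_slab_size))
            kmem_cache_free(sbi->page_array_slab, pages);
        else
            kfree(pages);
    }

Compared with the removed f2fs_kzalloc() calls, a named per-device slab avoids repeated small-order allocations on the compress/decompress hot paths.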
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 73683e58a08d..be4da52604ed 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio)
dic = (struct decompress_io_ctx *)page_private(page);
if (dic) {
- if (refcount_dec_not_one(&dic->ref))
+ if (atomic_dec_return(&dic->pending_pages))
continue;
f2fs_verify_pages(dic->rpages,
dic->cluster_size);
@@ -517,7 +517,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
zero_user_segment(page, 0, PAGE_SIZE);
SetPagePrivate(page);
- set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
+ set_page_private(page, DUMMY_WRITTEN_PAGE);
lock_page(page);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
f2fs_bug_on(sbi, 1);
@@ -1416,7 +1416,7 @@ alloc:
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
old_blkaddr = dn->data_blkaddr;
f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
- &sum, seg_type, NULL);
+ &sum, seg_type, NULL);
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
invalidate_mapping_pages(META_MAPPING(sbi),
old_blkaddr, old_blkaddr);
@@ -1803,10 +1803,6 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock,
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- /* Block number less than F2FS MAX BLOCKS */
- if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks))
- return -EFBIG;
-
return __get_data_block(inode, iblock, bh_result, create,
F2FS_GET_BLOCK_BMAP, NULL,
NO_CHECK_TYPE, create);
@@ -2272,8 +2268,8 @@ submit_and_realloc:
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
dic->failed = true;
- if (refcount_sub_and_test(dic->nr_cpages - i,
- &dic->ref)) {
+ if (!atomic_sub_return(dic->nr_cpages - i,
+ &dic->pending_pages)) {
f2fs_decompress_end_io(dic->rpages,
cc->cluster_size, true,
false);
@@ -3133,6 +3129,8 @@ next:
retry = 0;
}
}
+ if (f2fs_compressed_file(inode))
+ f2fs_destroy_compress_ctx(&cc);
#endif
if (retry) {
index = 0;
@@ -3574,7 +3572,7 @@ static void f2fs_dio_end_io(struct bio *bio)
bio->bi_private = dio->orig_private;
bio->bi_end_io = dio->orig_end_io;
- kvfree(dio);
+ kfree(dio);
bio_endio(bio);
}
@@ -3673,12 +3671,18 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
err);
if (!do_opu)
set_inode_flag(inode, FI_UPDATE_WRITE);
+ } else if (err == -EIOCBQUEUED) {
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
+ count - iov_iter_count(iter));
} else if (err < 0) {
f2fs_write_failed(mapping, offset + count);
}
} else {
if (err > 0)
f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err);
+ else if (err == -EIOCBQUEUED)
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO,
+ count - iov_iter_count(iter));
}
out:
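
For asynchronous direct I/O, __blockdev_direct_IO() returns -EIOCBQUEUED once the request is queued; that is not an error, and by then the iov_iter has already been advanced past everything submitted. The new branches above therefore derive the byte count from the iterator position, in effect:

    /* bytes handed to the block layer before the AIO was queued */
    ssize_t submitted = count - iov_iter_count(iter);

    f2fs_update_iostat(sbi, APP_DIRECT_IO, submitted);

Without this, queued AIO reads and writes were simply missing from the iostat totals.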
@@ -3807,11 +3811,16 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
filemap_write_and_wait(mapping);
- if (f2fs_compressed_file(inode))
- blknr = f2fs_bmap_compress(inode, block);
+ /* Block number less than F2FS MAX BLOCKS */
+ if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks))
+ goto out;
- if (!get_data_block_bmap(inode, block, &tmp, 0))
- blknr = tmp.b_blocknr;
+ if (f2fs_compressed_file(inode)) {
+ blknr = f2fs_bmap_compress(inode, block);
+ } else {
+ if (!get_data_block_bmap(inode, block, &tmp, 0))
+ blknr = tmp.b_blocknr;
+ }
out:
trace_f2fs_bmap(inode, block, blknr);
return blknr;
@@ -3874,6 +3883,83 @@ int f2fs_migrate_page(struct address_space *mapping,
#endif
#ifdef CONFIG_SWAP
+static int check_swap_activate_fast(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *span)
+{
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ sector_t cur_lblock;
+ sector_t last_lblock;
+ sector_t pblock;
+ sector_t lowest_pblock = -1;
+ sector_t highest_pblock = 0;
+ int nr_extents = 0;
+ unsigned long nr_pblocks;
+ unsigned long len;
+ int ret;
+
+ /*
+ * Map all the blocks into the extent list. This code doesn't try
+ * to be very smart.
+ */
+ cur_lblock = 0;
+ last_lblock = logical_to_blk(inode, i_size_read(inode));
+ len = i_size_read(inode);
+
+ while (cur_lblock <= last_lblock && cur_lblock < sis->max) {
+ struct buffer_head map_bh;
+ pgoff_t next_pgofs;
+
+ cond_resched();
+
+ memset(&map_bh, 0, sizeof(struct buffer_head));
+ map_bh.b_size = len - cur_lblock;
+
+ ret = get_data_block(inode, cur_lblock, &map_bh, 0,
+ F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
+ if (ret)
+ goto err_out;
+
+ /* hole */
+ if (!buffer_mapped(&map_bh))
+ goto err_out;
+
+ pblock = map_bh.b_blocknr;
+ nr_pblocks = logical_to_blk(inode, map_bh.b_size);
+
+ if (cur_lblock + nr_pblocks >= sis->max)
+ nr_pblocks = sis->max - cur_lblock;
+
+ if (cur_lblock) { /* exclude the header page */
+ if (pblock < lowest_pblock)
+ lowest_pblock = pblock;
+ if (pblock + nr_pblocks - 1 > highest_pblock)
+ highest_pblock = pblock + nr_pblocks - 1;
+ }
+
+ /*
+ * found a contiguous run of nr_pblocks blocks; record it as a
+ * single swap extent
+ */
+ ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock);
+ if (ret < 0)
+ goto out;
+ nr_extents += ret;
+ cur_lblock += nr_pblocks;
+ }
+ ret = nr_extents;
+ *span = 1 + highest_pblock - lowest_pblock;
+ if (cur_lblock == 0)
+ cur_lblock = 1; /* force Empty message */
+ sis->max = cur_lblock;
+ sis->pages = cur_lblock - 1;
+ sis->highest_bit = cur_lblock - 1;
+out:
+ return ret;
+err_out:
+ pr_err("swapon: swapfile has holes\n");
+ return -EINVAL;
+}
+
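
A worked example of the fast path above, with invented block numbers: take a four-page swapfile whose page 0 (the header) maps to block 2000 and whose pages 1-3 map to blocks 1000-1002. F2FS_GET_BLOCK_FIEMAP hands back each contiguous run whole, so the loop takes two iterations:

    add_swap_extent(sis, 0, 1, 2000); /* header page: excluded from span */
    add_swap_extent(sis, 1, 3, 1000); /* lowest_pblock = 1000, highest_pblock = 1002 */
    /* result: ret = nr_extents = 2, *span = 1 + 1002 - 1000 = 3 */

Because this path is only taken when PAGE_SIZE == F2FS_BLKSIZE (see the dispatch added to check_swap_activate() below), one logical page is exactly one block, which is what lets it skip the per-page sub-block probing of the generic copy that follows.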
/* Copied from generic_swapfile_activate() to check any holes */
static int check_swap_activate(struct swap_info_struct *sis,
struct file *swap_file, sector_t *span)
@@ -3890,6 +3976,9 @@ static int check_swap_activate(struct swap_info_struct *sis,
int nr_extents = 0;
int ret;
+ if (PAGE_SIZE == F2FS_BLKSIZE)
+ return check_swap_activate_fast(sis, swap_file, span);
+
blkbits = inode->i_blkbits;
blocks_per_page = PAGE_SIZE >> blkbits;
@@ -3989,7 +4078,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret)
return ret;
- if (f2fs_disable_compressed_file(inode))
+ if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
ret = check_swap_activate(sis, file, span);
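
The refcount_t-to-atomic_t conversion threaded through compress.c and data.c in this patch is the standard in-flight I/O counter pattern: refcount_t is meant for object lifetimes and cannot cleanly count down to zero, which is why the old code had to use refcount_dec_not_one() with an implicit floor of one. The shape of the new pattern, with placeholder names (ctx, finish_cluster) rather than f2fs symbols:

    /* at submission time: one count per in-flight compressed page */
    atomic_set(&ctx->pending_pages, nr_cpages);

    /* in each per-page completion handler: */
    if (atomic_dec_return(&ctx->pending_pages))
        return;                 /* other pages still in flight */
    finish_cluster(ctx);        /* the last completion finalizes */

The same "last one out" logic backs the atomic_sub_return() error path in f2fs_read_multi_pages() above.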
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 4276c0f79beb..a8357fd4f5fa 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -131,7 +131,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
si->compr_inode = atomic_read(&sbi->compr_inode);
- si->compr_blocks = atomic_read(&sbi->compr_blocks);
+ si->compr_blocks = atomic64_read(&sbi->compr_blocks);
si->append = sbi->im[APPEND_INO].ino_num;
si->update = sbi->im[UPDATE_INO].ino_num;
si->orphans = sbi->im[ORPHAN_INO].ino_num;
@@ -164,7 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
si->util_invalid = 50 - si->util_free - si->util_valid;
- for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
+ for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
si->curseg[i] = curseg->segno;
si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
@@ -342,7 +342,7 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_inode);
seq_printf(s, " - Inline_dentry Inode: %u\n",
si->inline_dir);
- seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n",
+ seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n",
si->compr_inode, si->compr_blocks);
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
si->orphans, si->append, si->update);
@@ -393,6 +393,14 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_seg[CURSEG_COLD_NODE],
si->full_seg[CURSEG_COLD_NODE],
si->valid_blks[CURSEG_COLD_NODE]);
+ seq_printf(s, " - Pinned file: %8d %8d %8d\n",
+ si->curseg[CURSEG_COLD_DATA_PINNED],
+ si->cursec[CURSEG_COLD_DATA_PINNED],
+ si->curzone[CURSEG_COLD_DATA_PINNED]);
+ seq_printf(s, " - ATGC data: %8d %8d %8d\n",
+ si->curseg[CURSEG_ALL_DATA_ATGC],
+ si->cursec[CURSEG_ALL_DATA_ATGC],
+ si->curzone[CURSEG_ALL_DATA_ATGC]);
seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
si->main_area_segs - si->dirty_count -
si->prefree_count - si->free_segs,
@@ -542,7 +550,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
atomic_set(&sbi->compr_inode, 0);
- atomic_set(&sbi->compr_blocks, 0);
+ atomic64_set(&sbi->compr_blocks, 0);
atomic_set(&sbi->inplace_count, 0);
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
@@ -566,7 +574,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
list_del(&si->stat_list);
mutex_unlock(&f2fs_stat_mutex);
- kvfree(si);
+ kfree(si);
}
void __init f2fs_create_root_stats(void)
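
Widening compr_blocks from atomic_t to atomic64_t is an overflow fix: a 32-bit count of 4 KiB blocks wraps at 2^32 blocks, i.e. 16 TiB worth of compressed blocks, which a large long-lived volume can plausibly accumulate. The matching stat macros in the f2fs.h hunks below reduce to:

    atomic64_add(blocks, &sbi->compr_blocks);
    si->compr_blocks = atomic64_read(&sbi->compr_blocks); /* now unsigned long long */
    seq_printf(s, "Blocks: %llu\n", si->compr_blocks);    /* hence the %llu */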
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 069f498af1e3..4b9ef8bbfa4a 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -75,21 +75,22 @@ int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname)
{
#ifdef CONFIG_UNICODE
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
if (IS_CASEFOLDED(dir)) {
fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN,
GFP_NOFS);
if (!fname->cf_name.name)
return -ENOMEM;
- fname->cf_name.len = utf8_casefold(sbi->s_encoding,
+ fname->cf_name.len = utf8_casefold(sb->s_encoding,
fname->usr_fname,
fname->cf_name.name,
F2FS_NAME_LEN);
if ((int)fname->cf_name.len <= 0) {
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
- if (f2fs_has_strict_mode(sbi))
+ if (sb_has_strict_encoding(sb))
return -EINVAL;
/* fall back to treating name as opaque byte sequence */
}
@@ -111,7 +112,7 @@ static int __f2fs_setup_filename(const struct inode *dir,
#ifdef CONFIG_FS_ENCRYPTION
fname->crypto_buf = crypt_name->crypto_buf;
#endif
- if (crypt_name->is_ciphertext_name) {
+ if (crypt_name->is_nokey_name) {
/* hash was decoded from the no-key name */
fname->hash = cpu_to_le32(crypt_name->hash);
} else {
@@ -190,21 +191,15 @@ static unsigned long dir_block_index(unsigned int level,
static struct f2fs_dir_entry *find_in_block(struct inode *dir,
struct page *dentry_page,
const struct f2fs_filename *fname,
- int *max_slots,
- struct page **res_page)
+ int *max_slots)
{
struct f2fs_dentry_block *dentry_blk;
- struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
make_dentry_ptr_block(dir, &d, dentry_blk);
- de = f2fs_find_target_dentry(&d, fname, max_slots);
- if (de)
- *res_page = dentry_page;
-
- return de;
+ return f2fs_find_target_dentry(&d, fname, max_slots);
}
#ifdef CONFIG_UNICODE
@@ -215,8 +210,8 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
const u8 *de_name, u32 de_name_len)
{
- const struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
- const struct unicode_map *um = sbi->s_encoding;
+ const struct super_block *sb = dir->i_sb;
+ const struct unicode_map *um = sb->s_encoding;
struct qstr entry = QSTR_INIT(de_name, de_name_len);
int res;
@@ -226,7 +221,7 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
* In strict mode, ignore invalid names. In non-strict mode,
* fall back to treating them as opaque byte sequences.
*/
- if (f2fs_has_strict_mode(sbi) || name->len != entry.len)
+ if (sb_has_strict_encoding(sb) || name->len != entry.len)
return false;
return !memcmp(name->name, entry.name, name->len);
}
@@ -330,10 +325,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
}
}
- de = find_in_block(dir, dentry_page, fname, &max_slots,
- res_page);
- if (de)
+ de = find_in_block(dir, dentry_page, fname, &max_slots);
+ if (de) {
+ *res_page = dentry_page;
break;
+ }
if (max_slots >= s)
room = true;
@@ -357,16 +353,15 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
unsigned int max_depth;
unsigned int level;
+ *res_page = NULL;
+
if (f2fs_has_inline_dentry(dir)) {
- *res_page = NULL;
de = f2fs_find_in_inline_dir(dir, fname, res_page);
goto out;
}
- if (npages == 0) {
- *res_page = NULL;
+ if (npages == 0)
goto out;
- }
max_depth = F2FS_I(dir)->i_current_depth;
if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
@@ -377,7 +372,6 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
}
for (level = 0; level < max_depth; level++) {
- *res_page = NULL;
de = find_in_level(dir, level, fname, res_page);
if (de || IS_ERR(*res_page))
break;
@@ -537,7 +531,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
goto put_error;
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_inherit_context(dir, inode, page, false);
+ err = fscrypt_set_context(inode, page);
if (err)
goto put_error;
}
@@ -1032,7 +1026,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (err)
goto out;
- err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr);
if (err < 0)
goto out;
}
@@ -1107,75 +1101,8 @@ const struct file_operations f2fs_dir_operations = {
};
#ifdef CONFIG_UNICODE
-static int f2fs_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
-{
- const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *dir = READ_ONCE(parent->d_inode);
- const struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- struct qstr entry = QSTR_INIT(str, len);
- char strbuf[DNAME_INLINE_LEN];
- int res;
-
- if (!dir || !IS_CASEFOLDED(dir))
- goto fallback;
-
- /*
- * If the dentry name is stored in-line, then it may be concurrently
- * modified by a rename. If this happens, the VFS will eventually retry
- * the lookup, so it doesn't matter what ->d_compare() returns.
- * However, it's unsafe to call utf8_strncasecmp() with an unstable
- * string. Therefore, we have to copy the name into a temporary buffer.
- */
- if (len <= DNAME_INLINE_LEN - 1) {
- memcpy(strbuf, str, len);
- strbuf[len] = 0;
- entry.name = strbuf;
- /* prevent compiler from optimizing out the temporary buffer */
- barrier();
- }
-
- res = utf8_strncasecmp(sbi->s_encoding, name, &entry);
- if (res >= 0)
- return res;
-
- if (f2fs_has_strict_mode(sbi))
- return -EINVAL;
-fallback:
- if (len != name->len)
- return 1;
- return !!memcmp(str, name->name, len);
-}
-
-static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- const struct unicode_map *um = sbi->s_encoding;
- const struct inode *inode = READ_ONCE(dentry->d_inode);
- unsigned char *norm;
- int len, ret = 0;
-
- if (!inode || !IS_CASEFOLDED(inode))
- return 0;
-
- norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC);
- if (!norm)
- return -ENOMEM;
-
- len = utf8_casefold(um, str, norm, PATH_MAX);
- if (len < 0) {
- if (f2fs_has_strict_mode(sbi))
- ret = -EINVAL;
- goto out;
- }
- str->hash = full_name_hash(dentry, norm, len);
-out:
- kvfree(norm);
- return ret;
-}
-
const struct dentry_operations f2fs_dentry_ops = {
- .d_hash = f2fs_d_hash,
- .d_compare = f2fs_d_compare,
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
};
#endif
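
With the unicode_map moved from f2fs_sb_info to the generic super_block (see the f2fs.h hunk below removing s_encoding), the private d_hash/d_compare become redundant and f2fs can reuse the VFS helpers. A simplified sketch of the comparison those helpers perform, reconstructed from the removed f2fs_match_ci_name() above rather than from fs/libfs.c itself:

    static int ci_compare_sketch(const struct super_block *sb,
                                 const struct qstr *name,
                                 const u8 *de_name, u32 de_name_len)
    {
        struct qstr entry = QSTR_INIT(de_name, de_name_len);
        int res = utf8_strncasecmp(sb->s_encoding, name, &entry);

        if (res >= 0)
            return res;             /* 0 = match, 1 = mismatch */
        /* invalid UTF-8: strict mode rejects the name outright ... */
        if (sb_has_strict_encoding(sb))
            return 1;
        /* ... otherwise fall back to an opaque byte comparison */
        return name->len != de_name_len ||
               memcmp(name->name, de_name, de_name_len);
    }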
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 686c68b98610..3ebf976a682d 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -58,6 +58,29 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
return re;
}
+struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi,
+ struct rb_root_cached *root,
+ struct rb_node **parent,
+ unsigned long long key, bool *leftmost)
+{
+ struct rb_node **p = &root->rb_root.rb_node;
+ struct rb_entry *re;
+
+ while (*p) {
+ *parent = *p;
+ re = rb_entry(*parent, struct rb_entry, rb_node);
+
+ if (key < re->key) {
+ p = &(*p)->rb_left;
+ } else {
+ p = &(*p)->rb_right;
+ *leftmost = false;
+ }
+ }
+
+ return p;
+}
+
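
f2fs_lookup_rb_tree_ext() hands back the link point for a 64-bit key; equal keys descend to the right, and *leftmost is cleared as soon as the walk turns right, keeping the cached-leftmost bookkeeping correct. A caller stitches a node in like this (the same sequence attach_victim_entry() uses in gc.c later in this patch):

    struct rb_node **p, *parent = NULL;
    bool leftmost = true;

    p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, key, &leftmost);
    rb_link_node(&ve->rb_node, parent, p);
    rb_insert_color_cached(&ve->rb_node, &am->root, leftmost);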
struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
struct rb_root_cached *root,
struct rb_node **parent,
@@ -166,7 +189,7 @@ lookup_neighbors:
}
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
- struct rb_root_cached *root)
+ struct rb_root_cached *root, bool check_key)
{
#ifdef CONFIG_F2FS_CHECK_FS
struct rb_node *cur = rb_first_cached(root), *next;
@@ -183,13 +206,23 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
cur_re = rb_entry(cur, struct rb_entry, rb_node);
next_re = rb_entry(next, struct rb_entry, rb_node);
+ if (check_key) {
+ if (cur_re->key > next_re->key) {
+ f2fs_info(sbi, "inconsistent rbtree, cur(%llu) next(%llu)",
+ cur_re->key, next_re->key);
+ return false;
+ }
+ goto next;
+ }
+
if (cur_re->ofs + cur_re->len > next_re->ofs) {
f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)",
cur_re->ofs, cur_re->len,
next_re->ofs, next_re->len);
return false;
}
-
+next:
cur = next;
}
#endif
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d9e52a7f3702..cb700d797296 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -98,6 +98,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
#define F2FS_MOUNT_NORECOVERY 0x04000000
+#define F2FS_MOUNT_ATGC 0x08000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -138,7 +139,7 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
- struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */
+ struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */
block_t unusable_cap_perc; /* percentage for cap */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
@@ -612,8 +613,13 @@ enum {
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
- unsigned int ofs; /* start offset of the entry */
- unsigned int len; /* length of the entry */
+ union {
+ struct {
+ unsigned int ofs; /* start offset of the entry */
+ unsigned int len; /* length of the entry */
+ };
+ unsigned long long key; /* 64-bits key */
+ } __packed;
};
struct extent_info {
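
The union added to struct rb_entry overlays the two 32-bit extent fields with a single 64-bit key, so the same rb-tree helpers can index extent entries by (ofs, len) and, later in this patch, ATGC victim entries by mtime. The overlay is only safe if the pair packs into exactly the key's 8 bytes; a compile-time check of that assumption (a sketch, not part of the patch) would be:

    static_assert(sizeof(struct rb_entry) ==
                  sizeof(struct rb_node) + sizeof(unsigned long long),
                  "ofs + len must exactly overlay the 64-bit key");

The __packed annotation is what rules out padding between and around the two unsigned ints.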
@@ -801,7 +807,7 @@ struct f2fs_inode_info {
struct timespec64 i_disk_time[4];/* inode disk times */
/* for file compress */
- u64 i_compr_blocks; /* # of compressed blocks */
+ atomic_t i_compr_blocks; /* # of compressed blocks */
unsigned char i_compress_algorithm; /* algorithm type */
unsigned char i_log_cluster_size; /* log of cluster size */
unsigned int i_cluster_size; /* cluster size */
@@ -973,7 +979,9 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
*/
#define NR_CURSEG_DATA_TYPE (3)
#define NR_CURSEG_NODE_TYPE (3)
-#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
+#define NR_CURSEG_INMEM_TYPE (2)
+#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
+#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE)
enum {
CURSEG_HOT_DATA = 0, /* directory entry blocks */
@@ -982,8 +990,11 @@ enum {
CURSEG_HOT_NODE, /* direct node blocks of directory files */
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
- NO_CHECK_TYPE,
- CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */
+ NR_PERSISTENT_LOG, /* number of persistent logs */
+ CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG,
+ /* pinned file that needs consecutive block address */
+ CURSEG_ALL_DATA_ATGC, /* SSR allocator in hot/warm/cold data area */
+ NO_CHECK_TYPE, /* number of persistent & inmem logs */
};
struct flush_cmd {
@@ -1209,6 +1220,7 @@ struct f2fs_dev_info {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_blkz; /* Total number of zones */
unsigned long *blkz_seq; /* Bitmap indicating sequential zones */
+ block_t *zone_capacity_blocks; /* Array of zone capacity in blks */
#endif
};
@@ -1228,6 +1240,18 @@ struct inode_management {
unsigned long ino_num; /* number of entries */
};
+/* for GC_AT */
+struct atgc_management {
+ bool atgc_enabled; /* ATGC is enabled or not */
+ struct rb_root_cached root; /* root of victim rb-tree */
+ struct list_head victim_list; /* linked with all victim entries */
+ unsigned int victim_count; /* victim count in rb-tree */
+ unsigned int candidate_ratio; /* candidate ratio */
+ unsigned int max_candidate_count; /* max candidate count */
+ unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */
+ unsigned long long age_threshold; /* age threshold */
+};
+
/* For s_flag in struct f2fs_sb_info */
enum {
SBI_IS_DIRTY, /* dirty flag for checkpoint */
@@ -1260,6 +1284,7 @@ enum {
GC_NORMAL,
GC_IDLE_CB,
GC_IDLE_GREEDY,
+ GC_IDLE_AT,
GC_URGENT_HIGH,
GC_URGENT_LOW,
};
@@ -1303,9 +1328,9 @@ enum fsync_mode {
#define DUMMY_WRITTEN_PAGE ((unsigned long)-2)
#define IS_ATOMIC_WRITTEN_PAGE(page) \
- (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+ (page_private(page) == ATOMIC_WRITTEN_PAGE)
#define IS_DUMMY_WRITTEN_PAGE(page) \
- (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
+ (page_private(page) == DUMMY_WRITTEN_PAGE)
#ifdef CONFIG_F2FS_IO_TRACE
#define IS_IO_TRACED_PAGE(page) \
@@ -1315,13 +1340,6 @@ enum fsync_mode {
#define IS_IO_TRACED_PAGE(page) (0)
#endif
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) \
- (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL))
-#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
-#endif
-
/* For compression */
enum compress_algorithm_type {
COMPRESS_LZO,
@@ -1366,7 +1384,7 @@ struct compress_io_ctx {
struct inode *inode; /* inode the context belong to */
struct page **rpages; /* pages store raw data in cluster */
unsigned int nr_rpages; /* total page number in rpages */
- refcount_t ref; /* referrence count of raw page */
+ atomic_t pending_pages; /* in-flight compressed page count */
};
/* decompress io context for read IO path */
@@ -1385,7 +1403,7 @@ struct decompress_io_ctx {
struct compress_data *cbuf; /* virtual mapped address on cpages */
size_t rlen; /* valid data length in rbuf */
size_t clen; /* valid data length in cbuf */
- refcount_t ref; /* referrence count of compressed page */
+ atomic_t pending_pages; /* in-flight compressed page count */
bool failed; /* indicate IO error during decompression */
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
@@ -1394,7 +1412,7 @@ struct decompress_io_ctx {
#define NULL_CLUSTER ((unsigned int)(~0))
#define MIN_COMPRESS_LOG_SIZE 2
#define MAX_COMPRESS_LOG_SIZE 8
-#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE)
+#define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size))
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
@@ -1404,10 +1422,6 @@ struct f2fs_sb_info {
int valid_super_block; /* valid super block no */
unsigned long s_flag; /* flags for sbi */
struct mutex writepages; /* mutex for writepages() */
-#ifdef CONFIG_UNICODE
- struct unicode_map *s_encoding;
- __u16 s_encoding_flags;
-#endif
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int blocks_per_blkz; /* F2FS blocks per zone */
@@ -1515,6 +1529,7 @@ struct f2fs_sb_info {
* race between GC and GC or CP
*/
struct f2fs_gc_kthread *gc_thread; /* GC thread */
+ struct atgc_management am; /* atgc management */
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
@@ -1551,7 +1566,7 @@ struct f2fs_sb_info {
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
atomic_t compr_inode; /* # of compressed inodes */
- atomic_t compr_blocks; /* # of compressed blocks */
+ atomic64_t compr_blocks; /* # of compressed blocks */
atomic_t vw_cnt; /* # of volatile writes */
atomic_t max_aw_cnt; /* max # of atomic writes */
atomic_t max_vw_cnt; /* max # of volatile writes */
@@ -1600,6 +1615,11 @@ struct f2fs_sb_info {
struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct kmem_cache *page_array_slab; /* page array entry */
+ unsigned int page_array_slab_size; /* default page array slab size */
+#endif
};
struct f2fs_private_dio {
@@ -3332,6 +3352,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi);
int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
+void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi);
+void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi);
+void f2fs_get_new_segment(struct f2fs_sb_info *sbi,
+ unsigned int *newseg, bool new_sec, int dir);
void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end);
void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type);
@@ -3350,7 +3375,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn,
int f2fs_inplace_write_data(struct f2fs_io_info *fio);
void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t old_blkaddr, block_t new_blkaddr,
- bool recover_curseg, bool recover_newaddr);
+ bool recover_curseg, bool recover_newaddr,
+ bool from_gc);
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
unsigned char version, bool recover_curseg,
@@ -3378,6 +3404,10 @@ void f2fs_destroy_segment_manager_caches(void);
int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
enum page_type type, enum temp_type temp);
+unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
+ unsigned int segno);
+unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
+ unsigned int segno);
/*
* checkpoint.c
@@ -3385,7 +3415,7 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io);
struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
-struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index);
+struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
@@ -3493,6 +3523,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
unsigned int segno);
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count);
+int __init f2fs_create_garbage_collection_cache(void);
+void f2fs_destroy_garbage_collection_cache(void);
/*
* recovery.c
@@ -3528,7 +3560,8 @@ struct f2fs_stat_info {
int nr_discard_cmd;
unsigned int undiscard_blks;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
- int compr_inode, compr_blocks;
+ int compr_inode;
+ unsigned long long compr_blocks;
int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
@@ -3613,9 +3646,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
(atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \
} while (0)
#define stat_add_compr_blocks(inode, blocks) \
- (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks))
+ (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks))
#define stat_sub_compr_blocks(inode, blocks) \
- (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks))
+ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks))
#define stat_inc_meta_count(sbi, blkaddr) \
do { \
if (blkaddr < SIT_I(sbi)->sit_base_addr) \
@@ -3794,6 +3827,10 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
*/
struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
struct rb_entry *cached_re, unsigned int ofs);
+struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi,
+ struct rb_root_cached *root,
+ struct rb_node **parent,
+ unsigned long long key, bool *left_most);
struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
struct rb_root_cached *root,
struct rb_node **parent,
@@ -3804,7 +3841,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
struct rb_node ***insert_p, struct rb_node **insert_parent,
bool force, bool *leftmost);
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
- struct rb_root_cached *root);
+ struct rb_root_cached *root, bool check_key);
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
void f2fs_init_extent_tree(struct inode *inode, struct page *ipage);
void f2fs_drop_extent_tree(struct inode *inode);
@@ -3890,6 +3927,10 @@ void f2fs_decompress_end_io(struct page **rpages,
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
+int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi);
+void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi);
+int __init f2fs_init_compress_cache(void);
+void f2fs_destroy_compress_cache(void);
#else
static inline bool f2fs_is_compressed_page(struct page *page) { return false; }
static inline bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -3906,6 +3947,10 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
}
static inline int f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
+static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
+static inline int __init f2fs_init_compress_cache(void) { return 0; }
+static inline void f2fs_destroy_compress_cache(void) { }
#endif
static inline void set_compress_context(struct inode *inode)
@@ -3924,24 +3969,21 @@ static inline void set_compress_context(struct inode *inode)
f2fs_mark_inode_dirty_sync(inode, true);
}
-static inline u64 f2fs_disable_compressed_file(struct inode *inode)
+static inline bool f2fs_disable_compressed_file(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
if (!f2fs_compressed_file(inode))
- return 0;
- if (S_ISREG(inode->i_mode)) {
- if (get_dirty_pages(inode))
- return 1;
- if (fi->i_compr_blocks)
- return fi->i_compr_blocks;
- }
+ return true;
+ if (S_ISREG(inode->i_mode) &&
+ (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks)))
+ return false;
fi->i_flags &= ~F2FS_COMPR_FL;
stat_dec_compr_inode(inode);
clear_inode_flag(inode, FI_COMPRESSED_FILE);
f2fs_mark_inode_dirty_sync(inode, true);
- return 0;
+ return true;
}
#define F2FS_FEATURE_FUNCS(name, flagname) \
@@ -4022,22 +4064,6 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
-static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode)
-{
-#ifdef CONFIG_FS_ENCRYPTION
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- umode_t mode = inode->i_mode;
-
- /*
- * If the directory encrypted or dummy encryption enabled,
- * then we should encrypt the inode.
- */
- if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi))
- return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
-#endif
- return false;
-}
-
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
@@ -4051,16 +4077,17 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
u64 blocks, bool add)
{
int diff = F2FS_I(inode)->i_cluster_size - blocks;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
/* don't update i_compr_blocks if saved blocks were released */
- if (!add && !F2FS_I(inode)->i_compr_blocks)
+ if (!add && !atomic_read(&fi->i_compr_blocks))
return;
if (add) {
- F2FS_I(inode)->i_compr_blocks += diff;
+ atomic_add(diff, &fi->i_compr_blocks);
stat_add_compr_blocks(inode, diff);
} else {
- F2FS_I(inode)->i_compr_blocks -= diff;
+ atomic_sub(diff, &fi->i_compr_blocks);
stat_sub_compr_blocks(inode, diff);
}
f2fs_mark_inode_dirty_sync(inode, true);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8a422400e824..ee861c6d9ff0 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -376,32 +376,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
return f2fs_do_sync_file(file, start, end, datasync, false);
}
-static pgoff_t __get_first_dirty_index(struct address_space *mapping,
- pgoff_t pgofs, int whence)
-{
- struct page *page;
- int nr_pages;
-
- if (whence != SEEK_DATA)
- return 0;
-
- /* find first dirty page index */
- nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY,
- 1, &page);
- if (!nr_pages)
- return ULONG_MAX;
- pgofs = page->index;
- put_page(page);
- return pgofs;
-}
-
-static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr,
- pgoff_t dirty, pgoff_t pgofs, int whence)
+static bool __found_offset(struct address_space *mapping, block_t blkaddr,
+ pgoff_t index, int whence)
{
switch (whence) {
case SEEK_DATA:
- if ((blkaddr == NEW_ADDR && dirty == pgofs) ||
- __is_valid_data_blkaddr(blkaddr))
+ if (__is_valid_data_blkaddr(blkaddr))
+ return true;
+ if (blkaddr == NEW_ADDR &&
+ xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY))
return true;
break;
case SEEK_HOLE:
@@ -417,7 +400,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
struct inode *inode = file->f_mapping->host;
loff_t maxbytes = inode->i_sb->s_maxbytes;
struct dnode_of_data dn;
- pgoff_t pgofs, end_offset, dirty;
+ pgoff_t pgofs, end_offset;
loff_t data_ofs = offset;
loff_t isize;
int err = 0;
@@ -429,16 +412,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto fail;
/* handle inline data case */
- if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) {
- if (whence == SEEK_HOLE)
- data_ofs = isize;
+ if (f2fs_has_inline_data(inode) && whence == SEEK_HOLE) {
+ data_ofs = isize;
goto found;
}
pgofs = (pgoff_t)(offset >> PAGE_SHIFT);
- dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
-
for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE);
@@ -471,7 +451,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto fail;
}
- if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty,
+ if (__found_offset(file->f_mapping, blkaddr,
pgofs, whence)) {
f2fs_put_dnode(&dn);
goto found;
@@ -564,7 +544,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
bool compressed_cluster = false;
int cluster_index = 0, valid_blocks = 0;
int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
- bool released = !F2FS_I(dn->inode)->i_compr_blocks;
+ bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks);
if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
base = get_extra_isize(dn->inode);
@@ -753,11 +733,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
return err;
#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (from != free_from)
+ if (from != free_from) {
err = f2fs_truncate_partial_cluster(inode, from, lock);
+ if (err)
+ return err;
+ }
#endif
- return err;
+ return 0;
}
int f2fs_truncate(struct inode *inode)
@@ -1656,13 +1639,14 @@ next_alloc:
}
down_write(&sbi->pin_sem);
- map.m_seg_type = CURSEG_COLD_DATA_PINNED;
f2fs_lock_op(sbi);
- f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA);
+ f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED);
f2fs_unlock_op(sbi);
+ map.m_seg_type = CURSEG_COLD_DATA_PINNED;
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+
up_write(&sbi->pin_sem);
done += map.m_len;
@@ -1828,7 +1812,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
if ((iflags ^ masked_flags) & F2FS_COMPR_FL) {
if (masked_flags & F2FS_COMPR_FL) {
- if (f2fs_disable_compressed_file(inode))
+ if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
}
if (iflags & F2FS_NOCOMP_FL)
@@ -1836,6 +1820,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
if (iflags & F2FS_COMPR_FL) {
if (!f2fs_may_compress(inode))
return -EINVAL;
+ if (S_ISREG(inode->i_mode) && inode->i_size)
+ return -EINVAL;
set_compress_context(inode);
}
@@ -2783,6 +2769,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst))
return -EOPNOTSUPP;
+ if (pos_out < 0 || pos_in < 0)
+ return -EINVAL;
+
if (src == dst) {
if (pos_in == pos_out)
return 0;
@@ -3258,7 +3247,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
if (ret)
goto out;
- if (f2fs_disable_compressed_file(inode)) {
+ if (!f2fs_disable_compressed_file(inode)) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -3385,7 +3374,7 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
min(FSLABEL_MAX, count)))
err = -EFAULT;
- kvfree(vbuf);
+ kfree(vbuf);
return err;
}
@@ -3436,7 +3425,7 @@ static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg)
if (!f2fs_compressed_file(inode))
return -EINVAL;
- blocks = F2FS_I(inode)->i_compr_blocks;
+ blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks);
return put_user(blocks, (u64 __user *)arg);
}
@@ -3521,7 +3510,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
inode_lock(inode);
writecount = atomic_read(&inode->i_writecount);
- if ((filp->f_mode & FMODE_WRITE && writecount != 1) || writecount) {
+ if ((filp->f_mode & FMODE_WRITE && writecount != 1) ||
+ (!(filp->f_mode & FMODE_WRITE) && writecount)) {
ret = -EBUSY;
goto out;
}
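
The reworked writecount test fixes an always-busy bug: the old "|| writecount" arm fired even when the only writer was the fd issuing this ioctl. The matrix the new check implements (W = this fd opened for write, n = inode->i_writecount):

    /*
     *   W, n == 1  -> proceed  (the only writer is us)
     *   W, n  > 1  -> -EBUSY   (another writer exists)
     *  !W, n == 0  -> proceed  (no writers at all)
     *  !W, n  > 0  -> -EBUSY   (someone else can still write)
     */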
@@ -3540,7 +3530,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode, true);
- if (!F2FS_I(inode)->i_compr_blocks)
+ if (!atomic_read(&F2FS_I(inode)->i_compr_blocks))
goto out;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -3588,14 +3578,15 @@ out:
if (ret >= 0) {
ret = put_user(released_blocks, (u64 __user *)arg);
- } else if (released_blocks && F2FS_I(inode)->i_compr_blocks) {
+ } else if (released_blocks &&
+ atomic_read(&F2FS_I(inode)->i_compr_blocks)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx "
- "iblocks=%llu, released=%u, compr_blocks=%llu, "
+ "iblocks=%llu, released=%u, compr_blocks=%u, "
"run fsck to fix.",
__func__, inode->i_ino, inode->i_blocks,
released_blocks,
- F2FS_I(inode)->i_compr_blocks);
+ atomic_read(&F2FS_I(inode)->i_compr_blocks));
}
return ret;
@@ -3683,7 +3674,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
if (ret)
return ret;
- if (F2FS_I(inode)->i_compr_blocks)
+ if (atomic_read(&F2FS_I(inode)->i_compr_blocks))
goto out;
f2fs_balance_fs(F2FS_I_SB(inode), true);
@@ -3747,14 +3738,15 @@ out:
if (ret >= 0) {
ret = put_user(reserved_blocks, (u64 __user *)arg);
- } else if (reserved_blocks && F2FS_I(inode)->i_compr_blocks) {
+ } else if (reserved_blocks &&
+ atomic_read(&F2FS_I(inode)->i_compr_blocks)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx "
- "iblocks=%llu, reserved=%u, compr_blocks=%llu, "
+ "iblocks=%llu, reserved=%u, compr_blocks=%u, "
"run fsck to fix.",
__func__, inode->i_ino, inode->i_blocks,
reserved_blocks,
- F2FS_I(inode)->i_compr_blocks);
+ atomic_read(&F2FS_I(inode)->i_compr_blocks));
}
return ret;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 11b4adde9baf..05641a1e36cc 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -21,6 +21,8 @@
#include "gc.h"
#include <trace/events/f2fs.h>
+static struct kmem_cache *victim_entry_slab;
+
static unsigned int count_bits(const unsigned long *addr,
unsigned int offset, unsigned int len);
@@ -150,7 +152,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
err = PTR_ERR(gc_th->f2fs_gc_task);
- kvfree(gc_th);
+ kfree(gc_th);
sbi->gc_thread = NULL;
}
out:
@@ -163,13 +165,22 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
if (!gc_th)
return;
kthread_stop(gc_th->f2fs_gc_task);
- kvfree(gc_th);
+ kfree(gc_th);
sbi->gc_thread = NULL;
}
static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
{
- int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
+ int gc_mode;
+
+ if (gc_type == BG_GC) {
+ if (sbi->am.atgc_enabled)
+ gc_mode = GC_AT;
+ else
+ gc_mode = GC_CB;
+ } else {
+ gc_mode = GC_GREEDY;
+ }
switch (sbi->gc_mode) {
case GC_IDLE_CB:
@@ -179,7 +190,11 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
case GC_URGENT_HIGH:
gc_mode = GC_GREEDY;
break;
+ case GC_IDLE_AT:
+ gc_mode = GC_AT;
+ break;
}
+
return gc_mode;
}
@@ -193,6 +208,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->dirty_bitmap = dirty_i->dirty_segmap[type];
p->max_search = dirty_i->nr_dirty[type];
p->ofs_unit = 1;
+ } else if (p->alloc_mode == AT_SSR) {
+ p->gc_mode = GC_GREEDY;
+ p->dirty_bitmap = dirty_i->dirty_segmap[type];
+ p->max_search = dirty_i->nr_dirty[type];
+ p->ofs_unit = 1;
} else {
p->gc_mode = select_gc_type(sbi, gc_type);
p->ofs_unit = sbi->segs_per_sec;
@@ -212,6 +232,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
*/
if (gc_type != FG_GC &&
(sbi->gc_mode != GC_URGENT_HIGH) &&
+ (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) &&
p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
@@ -229,10 +250,16 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
return sbi->blocks_per_seg;
+ else if (p->alloc_mode == AT_SSR)
+ return UINT_MAX;
+
+ /* LFS */
if (p->gc_mode == GC_GREEDY)
return 2 * sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
+ else if (p->gc_mode == GC_AT)
+ return UINT_MAX;
else /* No other gc_mode */
return 0;
}
@@ -266,13 +293,14 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned char age = 0;
unsigned char u;
unsigned int i;
+ unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno);
- for (i = 0; i < sbi->segs_per_sec; i++)
+ for (i = 0; i < usable_segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
vblocks = get_valid_blocks(sbi, segno, true);
- mtime = div_u64(mtime, sbi->segs_per_sec);
- vblocks = div_u64(vblocks, sbi->segs_per_sec);
+ mtime = div_u64(mtime, usable_segs_per_sec);
+ vblocks = div_u64(vblocks, usable_segs_per_sec);
u = (vblocks * 100) >> sbi->log_blocks_per_seg;
@@ -297,8 +325,11 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
return get_valid_blocks(sbi, segno, true);
- else
+ else if (p->gc_mode == GC_CB)
return get_cb_cost(sbi, segno);
+
+ f2fs_bug_on(sbi, 1);
+ return 0;
}
static unsigned int count_bits(const unsigned long *addr,
@@ -313,6 +344,273 @@ static unsigned int count_bits(const unsigned long *addr,
return sum;
}
+static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi,
+ unsigned long long mtime, unsigned int segno,
+ struct rb_node *parent, struct rb_node **p,
+ bool left_most)
+{
+ struct atgc_management *am = &sbi->am;
+ struct victim_entry *ve;
+
+ ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS);
+
+ ve->mtime = mtime;
+ ve->segno = segno;
+
+ rb_link_node(&ve->rb_node, parent, p);
+ rb_insert_color_cached(&ve->rb_node, &am->root, left_most);
+
+ list_add_tail(&ve->list, &am->victim_list);
+
+ am->victim_count++;
+
+ return ve;
+}
+
+static void insert_victim_entry(struct f2fs_sb_info *sbi,
+ unsigned long long mtime, unsigned int segno)
+{
+ struct atgc_management *am = &sbi->am;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ bool left_most = true;
+
+ p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most);
+ attach_victim_entry(sbi, mtime, segno, parent, p, left_most);
+}
+
+static void add_victim_entry(struct f2fs_sb_info *sbi,
+ struct victim_sel_policy *p, unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
+ unsigned long long mtime = 0;
+ unsigned int i;
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (p->gc_mode == GC_AT &&
+ get_valid_blocks(sbi, segno, true) == 0)
+ return;
+
+ if (p->alloc_mode == AT_SSR &&
+ get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0)
+ return;
+ }
+
+ for (i = 0; i < sbi->segs_per_sec; i++)
+ mtime += get_seg_entry(sbi, start + i)->mtime;
+ mtime = div_u64(mtime, sbi->segs_per_sec);
+
+ /* handle the case where the user has changed the system time */
+ if (mtime < sit_i->min_mtime)
+ sit_i->min_mtime = mtime;
+ if (mtime > sit_i->max_mtime)
+ sit_i->max_mtime = mtime;
+ if (mtime < sit_i->dirty_min_mtime)
+ sit_i->dirty_min_mtime = mtime;
+ if (mtime > sit_i->dirty_max_mtime)
+ sit_i->dirty_max_mtime = mtime;
+
+ /* don't choose young section as candidate */
+ if (sit_i->dirty_max_mtime - mtime < p->age_threshold)
+ return;
+
+ insert_victim_entry(sbi, mtime, segno);
+}
+
+static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi,
+ struct victim_sel_policy *p)
+{
+ struct atgc_management *am = &sbi->am;
+ struct rb_node *parent = NULL;
+ bool left_most;
+
+ f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most);
+
+ return parent;
+}
+
+static void atgc_lookup_victim(struct f2fs_sb_info *sbi,
+ struct victim_sel_policy *p)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct atgc_management *am = &sbi->am;
+ struct rb_root_cached *root = &am->root;
+ struct rb_node *node;
+ struct rb_entry *re;
+ struct victim_entry *ve;
+ unsigned long long total_time;
+ unsigned long long age, u, accu;
+ unsigned long long max_mtime = sit_i->dirty_max_mtime;
+ unsigned long long min_mtime = sit_i->dirty_min_mtime;
+ unsigned int sec_blocks = BLKS_PER_SEC(sbi);
+ unsigned int vblocks;
+ unsigned int dirty_threshold = max(am->max_candidate_count,
+ am->candidate_ratio *
+ am->victim_count / 100);
+ unsigned int age_weight = am->age_weight;
+ unsigned int cost;
+ unsigned int iter = 0;
+
+ if (max_mtime < min_mtime)
+ return;
+
+ max_mtime += 1;
+ total_time = max_mtime - min_mtime;
+
+ accu = div64_u64(ULLONG_MAX, total_time);
+ accu = min_t(unsigned long long, div_u64(accu, 100),
+ DEFAULT_ACCURACY_CLASS);
+
+ node = rb_first_cached(root);
+next:
+ re = rb_entry_safe(node, struct rb_entry, rb_node);
+ if (!re)
+ return;
+
+ ve = (struct victim_entry *)re;
+
+ if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
+ goto skip;
+
+ /* age component: accu (<= 10000) * age fraction * age_weight (default 60) */
+ age = div64_u64(accu * (max_mtime - ve->mtime), total_time) *
+ age_weight;
+
+ vblocks = get_valid_blocks(sbi, ve->segno, true);
+ f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks);
+
+ /* utilization component: accu * invalid-block fraction * (100 - age_weight) */
+ u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) *
+ (100 - age_weight);
+
+ f2fs_bug_on(sbi, age + u >= UINT_MAX);
+
+ cost = UINT_MAX - (age + u);
+ iter++;
+
+ if (cost < p->min_cost ||
+ (cost == p->min_cost && age > p->oldest_age)) {
+ p->min_cost = cost;
+ p->oldest_age = age;
+ p->min_segno = ve->segno;
+ }
+skip:
+ if (iter < dirty_threshold) {
+ node = rb_next(node);
+ goto next;
+ }
+}
+
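
Plugging the defaults from gc.h below (DEFAULT_ACCURACY_CLASS = 10000, age_weight = 60) into the scoring above, with invented numbers: say total_time = max_mtime - min_mtime = 1000, so accu = min(ULLONG_MAX / 1000 / 100, 10000) = 10000. For a candidate section aged 80% of the mtime window with 30% of its blocks still valid:

    age  = 10000 * 800 / 1000 * 60         = 480000
    u    = 10000 * 70 / 100   * (100 - 60) = 280000
    cost = UINT_MAX - (480000 + 280000)

Lower cost wins the min_cost comparison, so old and mostly-invalid sections are preferred, with ties broken toward the greater age.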
+/*
+ * select candidates around source section in range of
+ * [target - dirty_threshold, target + dirty_threshold]
+ */
+static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
+ struct victim_sel_policy *p)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct atgc_management *am = &sbi->am;
+ struct rb_node *node;
+ struct rb_entry *re;
+ struct victim_entry *ve;
+ unsigned long long age;
+ unsigned long long max_mtime = sit_i->dirty_max_mtime;
+ unsigned long long min_mtime = sit_i->dirty_min_mtime;
+ unsigned int seg_blocks = sbi->blocks_per_seg;
+ unsigned int vblocks;
+ unsigned int dirty_threshold = max(am->max_candidate_count,
+ am->candidate_ratio *
+ am->victim_count / 100);
+ unsigned int cost;
+ unsigned int iter = 0;
+ int stage = 0;
+
+ if (max_mtime < min_mtime)
+ return;
+ max_mtime += 1;
+next_stage:
+ node = lookup_central_victim(sbi, p);
+next_node:
+ re = rb_entry_safe(node, struct rb_entry, rb_node);
+ if (!re) {
+ if (stage == 0)
+ goto skip_stage;
+ return;
+ }
+
+ ve = (struct victim_entry *)re;
+
+ if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
+ goto skip_node;
+
+ age = max_mtime - ve->mtime;
+
+ vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks;
+ f2fs_bug_on(sbi, !vblocks);
+
+ /* rare case */
+ if (vblocks == seg_blocks)
+ goto skip_node;
+
+ iter++;
+
+ age = max_mtime - abs(p->age - age);
+ cost = UINT_MAX - vblocks;
+
+ if (cost < p->min_cost ||
+ (cost == p->min_cost && age > p->oldest_age)) {
+ p->min_cost = cost;
+ p->oldest_age = age;
+ p->min_segno = ve->segno;
+ }
+skip_node:
+ if (iter < dirty_threshold) {
+ if (stage == 0)
+ node = rb_prev(node);
+ else if (stage == 1)
+ node = rb_next(node);
+ goto next_node;
+ }
+skip_stage:
+ if (stage < 1) {
+ stage++;
+ iter = 0;
+ goto next_stage;
+ }
+}
+
+static void lookup_victim_by_age(struct f2fs_sb_info *sbi,
+ struct victim_sel_policy *p)
+{
+ f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
+ &sbi->am.root, true));
+
+ if (p->gc_mode == GC_AT)
+ atgc_lookup_victim(sbi, p);
+ else if (p->alloc_mode == AT_SSR)
+ atssr_lookup_victim(sbi, p);
+ else
+ f2fs_bug_on(sbi, 1);
+}
+
+static void release_victim_entry(struct f2fs_sb_info *sbi)
+{
+ struct atgc_management *am = &sbi->am;
+ struct victim_entry *ve, *tmp;
+
+ list_for_each_entry_safe(ve, tmp, &am->victim_list, list) {
+ list_del(&ve->list);
+ kmem_cache_free(victim_entry_slab, ve);
+ am->victim_count--;
+ }
+
+ am->root = RB_ROOT_CACHED;
+
+ f2fs_bug_on(sbi, am->victim_count);
+ f2fs_bug_on(sbi, !list_empty(&am->victim_list));
+}
+
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
@@ -322,25 +620,37 @@ static unsigned int count_bits(const unsigned long *addr,
* which has minimum valid blocks and removes it from dirty seglist.
*/
static int get_victim_by_default(struct f2fs_sb_info *sbi,
- unsigned int *result, int gc_type, int type, char alloc_mode)
+ unsigned int *result, int gc_type, int type,
+ char alloc_mode, unsigned long long age)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct sit_info *sm = SIT_I(sbi);
struct victim_sel_policy p;
unsigned int secno, last_victim;
unsigned int last_segment;
- unsigned int nsearched = 0;
+ unsigned int nsearched;
+ bool is_atgc;
int ret = 0;
mutex_lock(&dirty_i->seglist_lock);
last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;
p.alloc_mode = alloc_mode;
- select_policy(sbi, gc_type, type, &p);
+ p.age = age;
+ p.age_threshold = sbi->am.age_threshold;
+retry:
+ select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
+ p.oldest_age = 0;
p.min_cost = get_max_cost(sbi, &p);
+ is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR);
+ nsearched = 0;
+
+ if (is_atgc)
+ SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX;
+
if (*result != NULL_SEGNO) {
if (!get_valid_blocks(sbi, *result, false)) {
ret = -ENODATA;
@@ -421,11 +731,16 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
/* Don't touch checkpointed data */
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
get_ckpt_valid_blocks(sbi, segno) &&
- p.alloc_mode != SSR))
+ p.alloc_mode == LFS))
goto next;
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
goto next;
+ if (is_atgc) {
+ add_victim_entry(sbi, &p, segno);
+ goto next;
+ }
+
cost = get_gc_cost(sbi, segno, &p);
if (p.min_cost > cost) {
@@ -444,6 +759,19 @@ next:
break;
}
}
+
+ /* get victim for GC_AT/AT_SSR */
+ if (is_atgc) {
+ lookup_victim_by_age(sbi, &p);
+ release_victim_entry(sbi);
+ }
+
+ if (is_atgc && p.min_segno == NULL_SEGNO &&
+ sm->elapsed_time < p.age_threshold) {
+ p.age_threshold = 0;
+ goto retry;
+ }
+
if (p.min_segno != NULL_SEGNO) {
got_it:
*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
@@ -536,6 +864,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi,
int phase = 0;
bool fggc = (gc_type == FG_GC);
int submitted = 0;
+ unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
start_addr = START_BLOCK(sbi, segno);
@@ -545,7 +874,7 @@ next_step:
if (fggc && phase == 2)
atomic_inc(&sbi->wb_sync_req[NODE]);
- for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+ for (off = 0; off < usable_blks_in_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
struct node_info ni;
@@ -791,6 +1120,8 @@ static int move_data_block(struct inode *inode, block_t bidx,
block_t newaddr;
int err = 0;
bool lfs_mode = f2fs_lfs_mode(fio.sbi);
+ int type = fio.sbi->am.atgc_enabled ?
+ CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;
/* do not read out */
page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
@@ -877,7 +1208,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
}
f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
- &sum, CURSEG_COLD_DATA, NULL);
+ &sum, type, NULL);
fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
@@ -927,7 +1258,7 @@ put_page_out:
recover_block:
if (err)
f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
- true, true);
+ true, true, true);
up_out:
if (lfs_mode)
up_write(&fio.sbi->io_order_lock);
@@ -1033,13 +1364,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
int off;
int phase = 0;
int submitted = 0;
+ unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
- for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+ for (off = 0; off < usable_blks_in_seg; off++, entry++) {
struct page *data_page;
struct inode *inode;
struct node_info dni; /* dnode info for the data */
@@ -1182,7 +1514,7 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
down_write(&sit_i->sentry_lock);
ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
- NO_CHECK_TYPE, LFS);
+ NO_CHECK_TYPE, LFS, 0);
up_write(&sit_i->sentry_lock);
return ret;
}
@@ -1204,6 +1536,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
if (__is_large_section(sbi))
end_segno = rounddown(end_segno, sbi->segs_per_sec);
+ /*
+ * zone-capacity can be less than zone-size in zoned devices,
+ * resulting in fewer usable segments in the zone than expected;
+ * calculate the last segno in the zone that can be garbage
+ * collected.
+ */
+ if (f2fs_sb_has_blkzoned(sbi))
+ end_segno -= sbi->segs_per_sec -
+ f2fs_usable_segs_in_sec(sbi, segno);
+
+ sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
+
/* readahead multi ssa blocks those have contiguous address */
if (__is_large_section(sbi))
f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
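A worked example of the end_segno adjustment above, with illustrative geometry (64 segments per section, of which only 48 are usable because the zone-capacity is smaller than the zone size):

#include <stdio.h>

int main(void)
{
	unsigned int segs_per_sec = 64;		/* zone == section */
	unsigned int usable_segs = 48;		/* from zone-capacity */
	unsigned int start_segno = 128;
	unsigned int end_segno = start_segno + segs_per_sec;

	/* same arithmetic as in do_garbage_collect() above */
	end_segno -= segs_per_sec - usable_segs;

	printf("GC scans segments [%u, %u)\n", start_segno, end_segno);
	return 0;
}

The scan window shrinks to [128, 176), so GC never touches the 16 segments past the zone-capacity boundary.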
@@ -1356,7 +1699,8 @@ gc_more:
goto stop;
seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
- if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
+ if (gc_type == FG_GC &&
+ seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
sec_freed++;
total_freed += seg_freed;
@@ -1413,6 +1757,37 @@ stop:
return ret;
}
+int __init f2fs_create_garbage_collection_cache(void)
+{
+ victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry",
+ sizeof(struct victim_entry));
+ if (!victim_entry_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void f2fs_destroy_garbage_collection_cache(void)
+{
+ kmem_cache_destroy(victim_entry_slab);
+}
+
+static void init_atgc_management(struct f2fs_sb_info *sbi)
+{
+ struct atgc_management *am = &sbi->am;
+
+ if (test_opt(sbi, ATGC) &&
+ SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD)
+ am->atgc_enabled = true;
+
+ am->root = RB_ROOT_CACHED;
+ INIT_LIST_HEAD(&am->victim_list);
+ am->victim_count = 0;
+
+ am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO;
+ am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT;
+ am->age_weight = DEF_GC_THREAD_AGE_WEIGHT;
+}
+
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
{
DIRTY_I(sbi)->v_ops = &default_v_ops;
@@ -1423,6 +1798,8 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi))
SIT_I(sbi)->last_victim[ALLOC_NEXT] =
GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
+
+ init_atgc_management(sbi);
}
static int free_segment_range(struct f2fs_sb_info *sbi,
@@ -1450,7 +1827,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
/* Move out cursegs from the target range */
- for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++)
+ for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++)
f2fs_allocate_segment_for_resize(sbi, type, start, end);
/* do GC to move out valid blocks in the range */
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index db3c61046aa4..0c8dae12dc51 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -14,6 +14,14 @@
#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
+
+/* choose candidates from sections which have an age of more than 7 days */
+#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7)
+#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */
+#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */
+#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */
+#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */
+
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
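A sketch of how these tunables plausibly combine into an ATGC victim cost. lookup_victim_by_age() is not shown in this diff, so the normalization below — scaling both the age and the invalid-block ratio to DEFAULT_ACCURACY_CLASS and mixing them 60:40 via DEF_GC_THREAD_AGE_WEIGHT — is an assumption about the formula's shape, not a verbatim copy of the implementation:

#include <stdio.h>
#include <stdint.h>

#define ACCURACY   10000ull	/* DEFAULT_ACCURACY_CLASS */
#define AGE_WEIGHT 60ull	/* DEF_GC_THREAD_AGE_WEIGHT */

static uint64_t atgc_cost(uint64_t mtime, uint64_t min_mtime,
			  uint64_t max_mtime, unsigned int valid,
			  unsigned int blks_per_sec)
{
	/* older section => larger age term */
	uint64_t age = ACCURACY * (max_mtime - mtime) /
					(max_mtime - min_mtime);
	/* fewer valid blocks => larger reclaimable term */
	uint64_t u = ACCURACY * (blks_per_sec - valid) / blks_per_sec;

	/* smaller cost is better, so subtract the weighted benefit */
	return UINT64_MAX - (age * AGE_WEIGHT + u * (100 - AGE_WEIGHT));
}

int main(void)
{
	/* two candidates: old-and-half-empty vs. young-and-nearly-empty */
	printf("old   : %llu\n",
	       (unsigned long long)atgc_cost(100, 100, 1000, 256, 512));
	printf("young : %llu\n",
	       (unsigned long long)atgc_cost(900, 100, 1000, 64, 512));
	return 0;
}

With AGE_WEIGHT at 60 the old half-empty section wins (lower cost) despite holding more valid data. DEF_GC_THREAD_CANDIDATE_RATIO and DEF_GC_THREAD_MAX_CANDIDATE_COUNT appear to bound how many of the oldest entries are scanned at all; they do not enter the cost itself.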
@@ -41,16 +49,69 @@ struct gc_inode_list {
struct radix_tree_root iroot;
};
+struct victim_info {
+ unsigned long long mtime; /* mtime of section */
+ unsigned int segno; /* section No. */
+};
+
+struct victim_entry {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ union {
+ struct {
+ unsigned long long mtime; /* mtime of section */
+ unsigned int segno; /* segment No. */
+ };
+ struct victim_info vi; /* victim info */
+ };
+ struct list_head list;
+};
+
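The union above lets rb-tree comparison code read ve->mtime and ve->segno directly while helpers pass ve->vi around as a single unit; a minimal standalone demonstration of that aliasing (C11 anonymous struct):

#include <stdio.h>

struct victim_info {
	unsigned long long mtime;
	unsigned int segno;
};

struct victim_entry {
	union {
		struct {
			unsigned long long mtime;
			unsigned int segno;
		};
		struct victim_info vi;	/* same storage as the pair above */
	};
};

int main(void)
{
	struct victim_entry ve = { .vi = { .mtime = 42, .segno = 7 } };

	/* both views read the same storage */
	printf("mtime=%llu segno=%u\n", ve.mtime, ve.segno);
	return 0;
}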
/*
* inline functions
*/
+
+/*
+ * On a zoned device, zone-capacity can be less than zone-size. If the
+ * zone-capacity is not aligned to the f2fs segment size (2MB), the segment
+ * starting just before the zone-capacity boundary has some blocks spanning
+ * across it, and those blocks are not usable.
+ * Such spanning segments can be in the free list, so calculate the sum of
+ * usable blocks in currently free segments, covering both normal and
+ * spanning segments.
+ */
+static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi)
+{
+ block_t free_seg_blks = 0;
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ int j;
+
+ spin_lock(&free_i->segmap_lock);
+ for (j = 0; j < MAIN_SEGS(sbi); j++)
+ if (!test_bit(j, free_i->free_segmap))
+ free_seg_blks += f2fs_usable_blks_in_seg(sbi, j);
+ spin_unlock(&free_i->segmap_lock);
+
+ return free_seg_blks;
+}
+
+static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi)
+{
+ if (f2fs_sb_has_blkzoned(sbi))
+ return free_segs_blk_count_zoned(sbi);
+
+ return free_segments(sbi) << sbi->log_blocks_per_seg;
+}
+
static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
{
- if (free_segments(sbi) < overprovision_segments(sbi))
+ block_t free_blks, ovp_blks;
+
+ free_blks = free_segs_blk_count(sbi);
+ ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg;
+
+ if (free_blks < ovp_blks)
return 0;
- else
- return (free_segments(sbi) - overprovision_segments(sbi))
- << sbi->log_blocks_per_seg;
+
+ return free_blks - ovp_blks;
}
static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
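A worked example for free_user_blocks() on a zoned device, with illustrative geometry: one normal and one capacity-truncated free segment weighed against one segment of overprovisioning:

#include <stdio.h>

int main(void)
{
	unsigned int log_blocks_per_seg = 9;		/* 512 blocks = 2MB */
	unsigned int usable[] = { 512, 384 };		/* spanning seg: 384 */
	unsigned int ovp_segments = 1;
	unsigned int free_blks = 0, ovp_blks;

	for (int i = 0; i < 2; i++)
		free_blks += usable[i];			/* 896 */
	ovp_blks = ovp_segments << log_blocks_per_seg;	/* 512 */

	printf("free user blocks: %u\n",
	       free_blks < ovp_blks ? 0 : free_blks - ovp_blks);
	return 0;
}

The old code would have reported a full 512 blocks free; counting per-segment usable blocks yields the honest 384.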
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 102df444f623..70384e31788d 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -524,7 +524,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
!f2fs_has_inline_xattr(dir))
F2FS_I(dir)->i_inline_xattr_size = 0;
- kvfree(backup_dentry);
+ kfree(backup_dentry);
return 0;
recover:
lock_page(ipage);
@@ -535,7 +535,7 @@ recover:
set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
- kvfree(backup_dentry);
+ kfree(backup_dentry);
return err;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 66969ae852b9..657db2fb6739 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -287,11 +287,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off",
+ __func__, inode->i_ino);
+ return false;
+ }
+
if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) &&
fi->i_flags & F2FS_COMPR_FL &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
if (ri->i_compress_algorithm >= COMPRESS_MAX) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
"compress algorithm: %u, run fsck to fix",
__func__, inode->i_ino,
@@ -300,6 +308,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
if (le64_to_cpu(ri->i_compr_blocks) >
SECTOR_TO_BLOCK(inode->i_blocks)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent "
"i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix",
__func__, inode->i_ino,
@@ -309,6 +318,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
"log cluster size: %u, run fsck to fix",
__func__, inode->i_ino,
@@ -442,7 +452,8 @@ static int do_read_inode(struct inode *inode)
(fi->i_flags & F2FS_COMPR_FL)) {
if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
- fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks);
+ atomic_set(&fi->i_compr_blocks,
+ le64_to_cpu(ri->i_compr_blocks));
fi->i_compress_algorithm = ri->i_compress_algorithm;
fi->i_log_cluster_size = ri->i_log_cluster_size;
fi->i_cluster_size = 1 << fi->i_log_cluster_size;
@@ -460,7 +471,7 @@ static int do_read_inode(struct inode *inode)
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
stat_inc_compr_inode(inode);
- stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
+ stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks));
return 0;
}
@@ -619,7 +630,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
i_log_cluster_size)) {
ri->i_compr_blocks =
- cpu_to_le64(F2FS_I(inode)->i_compr_blocks);
+ cpu_to_le64(atomic_read(
+ &F2FS_I(inode)->i_compr_blocks));
ri->i_compress_algorithm =
F2FS_I(inode)->i_compress_algorithm;
ri->i_log_cluster_size =
@@ -768,7 +780,8 @@ no_delete:
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
stat_dec_compr_inode(inode);
- stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
+ stat_sub_compr_blocks(inode,
+ atomic_read(&F2FS_I(inode)->i_compr_blocks));
if (likely(!f2fs_cp_error(sbi) &&
!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 84e4bbc1a64d..8fa37d1434de 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -28,6 +28,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_t ino;
struct inode *inode;
bool nid_free = false;
+ bool encrypt = false;
int xattr_size = 0;
int err;
@@ -69,13 +70,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
F2FS_DEF_PROJID);
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto fail_drop;
+
err = dquot_initialize(inode);
if (err)
goto fail_drop;
set_inode_flag(inode, FI_NEW_INODE);
- if (f2fs_may_encrypt(dir, inode))
+ if (encrypt)
f2fs_set_encrypted_inode(inode);
if (f2fs_sb_has_extra_attr(sbi)) {
@@ -707,7 +712,7 @@ out_f2fs_handle_failed_inode:
f2fs_handle_failed_inode(inode);
out_free_encrypted_link:
if (disk_link.name != (unsigned char *)symname)
- kvfree(disk_link.name);
+ kfree(disk_link.name);
return err;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index cb1b5b61a1da..d5d8ce077f29 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -109,7 +109,7 @@ static void clear_node_page_dirty(struct page *page)
static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
- return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid));
+ return f2fs_get_meta_page(sbi, current_nat_addr(sbi, nid));
}
static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
@@ -3105,9 +3105,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
- if (!version_bitmap)
- return -EFAULT;
-
nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
GFP_KERNEL);
if (!nm_i->nat_bitmap)
@@ -3257,7 +3254,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
kvfree(nm_i->nat_bitmap_mir);
#endif
sbi->nm_info = NULL;
- kvfree(nm_i);
+ kfree(nm_i);
}
int __init f2fs_create_node_manager_caches(void)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index e247a5ef3713..1596502f7375 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -189,7 +189,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
f2fs_trace_pid(page);
- f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
+ f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -728,7 +728,7 @@ init_thread:
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
err = PTR_ERR(fcc->f2fs_issue_flush);
- kvfree(fcc);
+ kfree(fcc);
SM_I(sbi)->fcc_info = NULL;
return err;
}
@@ -747,7 +747,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
kthread_stop(flush_thread);
}
if (free) {
- kvfree(fcc);
+ kfree(fcc);
SM_I(sbi)->fcc_info = NULL;
}
}
@@ -759,6 +759,9 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
if (!f2fs_is_multi_device(sbi))
return 0;
+ if (test_opt(sbi, NOBARRIER))
+ return 0;
+
for (i = 1; i < sbi->s_ndevs; i++) {
if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
continue;
@@ -859,20 +862,22 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned short valid_blocks, ckpt_valid_blocks;
+ unsigned int usable_blocks;
if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
return;
+ usable_blocks = f2fs_usable_blks_in_seg(sbi, segno);
mutex_lock(&dirty_i->seglist_lock);
valid_blocks = get_valid_blocks(sbi, segno, false);
ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno);
if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) ||
- ckpt_valid_blocks == sbi->blocks_per_seg)) {
+ ckpt_valid_blocks == usable_blocks)) {
__locate_dirty_segment(sbi, segno, PRE);
__remove_dirty_segment(sbi, segno, DIRTY);
- } else if (valid_blocks < sbi->blocks_per_seg) {
+ } else if (valid_blocks < usable_blocks) {
__locate_dirty_segment(sbi, segno, DIRTY);
} else {
/* Recovery routine with SSR needs this */
@@ -915,9 +920,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
se = get_seg_entry(sbi, segno);
if (IS_NODESEG(se->type))
- holes[NODE] += sbi->blocks_per_seg - se->valid_blocks;
+ holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) -
+ se->valid_blocks;
else
- holes[DATA] += sbi->blocks_per_seg - se->valid_blocks;
+ holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) -
+ se->valid_blocks;
}
mutex_unlock(&dirty_i->seglist_lock);
@@ -1521,7 +1528,7 @@ retry:
goto next;
if (unlikely(dcc->rbtree_check))
f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
- &dcc->root));
+ &dcc->root, false));
blk_start_plug(&plug);
list_for_each_entry_safe(dc, tmp, pend_list, list) {
f2fs_bug_on(sbi, dc->state != D_PREP);
@@ -1958,7 +1965,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
mutex_lock(&dirty_i->seglist_lock);
for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
- __set_test_and_free(sbi, segno);
+ __set_test_and_free(sbi, segno, false);
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -2101,7 +2108,7 @@ init_thread:
"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(dcc->f2fs_issue_discard)) {
err = PTR_ERR(dcc->f2fs_issue_discard);
- kvfree(dcc);
+ kfree(dcc);
SM_I(sbi)->dcc_info = NULL;
return err;
}
@@ -2125,7 +2132,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
if (unlikely(atomic_read(&dcc->discard_cmd_cnt)))
f2fs_issue_discard_timeout(sbi);
- kvfree(dcc);
+ kfree(dcc);
SM_I(sbi)->dcc_info = NULL;
}
@@ -2150,6 +2157,39 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
__mark_sit_entry_dirty(sbi, segno);
}
+static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ unsigned int segno = GET_SEGNO(sbi, blkaddr);
+
+ if (segno == NULL_SEGNO)
+ return 0;
+ return get_seg_entry(sbi, segno)->mtime;
+}
+
+static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr,
+ unsigned long long old_mtime)
+{
+ struct seg_entry *se;
+ unsigned int segno = GET_SEGNO(sbi, blkaddr);
+ unsigned long long ctime = get_mtime(sbi, false);
+ unsigned long long mtime = old_mtime ? old_mtime : ctime;
+
+ if (segno == NULL_SEGNO)
+ return;
+
+ se = get_seg_entry(sbi, segno);
+
+ if (!se->mtime)
+ se->mtime = mtime;
+ else
+ se->mtime = div_u64(se->mtime * se->valid_blocks + mtime,
+ se->valid_blocks + 1);
+
+ if (ctime > SIT_I(sbi)->max_mtime)
+ SIT_I(sbi)->max_mtime = ctime;
+}
+
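The running average in update_segment_mtime() can be checked standalone: each newly written block folds its mtime into the segment's mtime, weighted by the blocks already counted. In the kernel the valid-block count is bumped separately by update_sit_entry(); the explicit increment below stands in for that:

#include <stdio.h>

int main(void)
{
	unsigned long long se_mtime = 0;
	unsigned int valid_blocks = 0;
	unsigned long long writes[] = { 100, 200, 600 };

	for (int i = 0; i < 3; i++) {
		unsigned long long mtime = writes[i];

		if (!se_mtime)
			se_mtime = mtime;
		else
			se_mtime = (se_mtime * valid_blocks + mtime) /
						(valid_blocks + 1);
		valid_blocks++;	/* update_sit_entry() does this in f2fs */
		printf("after write %d: mtime=%llu\n", i, se_mtime);
	}
	return 0;
}

The averages come out 100, 150, 300, so the segment's mtime tracks the age of its blocks rather than just the last write, which is what the ATGC age calculation needs.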
static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
{
struct seg_entry *se;
@@ -2167,12 +2207,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
f2fs_bug_on(sbi, (new_vblocks < 0 ||
- (new_vblocks > sbi->blocks_per_seg)));
+ (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno))));
se->valid_blocks = new_vblocks;
- se->mtime = get_mtime(sbi, false);
- if (se->mtime > SIT_I(sbi)->max_mtime)
- SIT_I(sbi)->max_mtime = se->mtime;
/* Update valid block bitmap */
if (del > 0) {
@@ -2265,6 +2302,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
/* add it into sit main buffer */
down_write(&sit_i->sentry_lock);
+ update_segment_mtime(sbi, addr, 0);
update_sit_entry(sbi, addr, -1);
/* add it into dirty seglist */
@@ -2344,7 +2382,9 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
*/
struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
{
- return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno));
+ if (unlikely(f2fs_cp_error(sbi)))
+ return ERR_PTR(-EIO);
+ return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno));
}
void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
@@ -2389,9 +2429,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
-static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
+static int is_next_segment_free(struct f2fs_sb_info *sbi,
+ struct curseg_info *curseg, int type)
{
- struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
@@ -2495,7 +2535,9 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
struct summary_footer *sum_footer;
+ unsigned short seg_type = curseg->seg_type;
+ curseg->inited = true;
curseg->segno = curseg->next_segno;
curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
curseg->next_blkoff = 0;
@@ -2503,24 +2545,36 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
sum_footer = &(curseg->sum_blk->footer);
memset(sum_footer, 0, sizeof(struct summary_footer));
- if (IS_DATASEG(type))
+
+ sanity_check_seg_type(sbi, seg_type);
+
+ if (IS_DATASEG(seg_type))
SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
- if (IS_NODESEG(type))
+ if (IS_NODESEG(seg_type))
SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
- __set_sit_entry_type(sbi, type, curseg->segno, modified);
+ __set_sit_entry_type(sbi, seg_type, curseg->segno, modified);
}
static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned short seg_type = curseg->seg_type;
+
+ sanity_check_seg_type(sbi, seg_type);
+
 /* if segs_per_sec is larger than 1, we need to keep original policy. */
if (__is_large_section(sbi))
- return CURSEG_I(sbi, type)->segno;
+ return curseg->segno;
+
+ /* the inmem log may not be located on any segment after mount */
+ if (!curseg->inited)
+ return 0;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return 0;
if (test_opt(sbi, NOHEAP) &&
- (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
+ (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)))
return 0;
if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
@@ -2530,7 +2584,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
return 0;
- return CURSEG_I(sbi, type)->segno;
+ return curseg->segno;
}
/*
@@ -2540,12 +2594,14 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned short seg_type = curseg->seg_type;
unsigned int segno = curseg->segno;
int dir = ALLOC_LEFT;
- write_sum_page(sbi, curseg->sum_blk,
+ if (curseg->inited)
+ write_sum_page(sbi, curseg->sum_blk,
GET_SUM_BLOCK(sbi, segno));
- if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
+ if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA)
dir = ALLOC_RIGHT;
if (test_opt(sbi, NOHEAP))
@@ -2594,7 +2650,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type)
+static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2602,8 +2658,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
struct f2fs_summary_block *sum_node;
struct page *sum_page;
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ if (flush)
+ write_sum_page(sbi, curseg->sum_blk,
+ GET_SUM_BLOCK(sbi, curseg->segno));
+
__set_test_and_inuse(sbi, new_segno);
mutex_lock(&dirty_i->seglist_lock);
@@ -2616,29 +2674,139 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
__next_free_blkoff(sbi, curseg, 0);
sum_page = f2fs_get_sum_page(sbi, new_segno);
- f2fs_bug_on(sbi, IS_ERR(sum_page));
+ if (IS_ERR(sum_page)) {
+ /* zero it: GC must not use stale summary pages after cp_error */
+ memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
+ return;
+ }
sum_node = (struct f2fs_summary_block *)page_address(sum_page);
memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
f2fs_put_page(sum_page, 1);
}
-static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
+ int alloc_mode, unsigned long long age);
+
+static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
+ int target_type, int alloc_mode,
+ unsigned long long age)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+
+ curseg->seg_type = target_type;
+
+ if (get_ssr_segment(sbi, type, alloc_mode, age)) {
+ struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
+
+ curseg->seg_type = se->type;
+ change_curseg(sbi, type, true);
+ } else {
+ /* allocate cold segment by default */
+ curseg->seg_type = CURSEG_COLD_DATA;
+ new_curseg(sbi, type, true);
+ }
+ stat_inc_seg_type(sbi, curseg);
+}
+
+static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC);
+
+ if (!sbi->am.atgc_enabled)
+ return;
+
+ down_read(&SM_I(sbi)->curseg_lock);
+
+ mutex_lock(&curseg->curseg_mutex);
+ down_write(&SIT_I(sbi)->sentry_lock);
+
+ get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0);
+
+ up_write(&SIT_I(sbi)->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+
+ up_read(&SM_I(sbi)->curseg_lock);
+}
+
+void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
+{
+ __f2fs_init_atgc_curseg(sbi);
+}
+
+static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+ if (!curseg->inited)
+ goto out;
+
+ if (get_valid_blocks(sbi, curseg->segno, false)) {
+ write_sum_page(sbi, curseg->sum_blk,
+ GET_SUM_BLOCK(sbi, curseg->segno));
+ } else {
+ mutex_lock(&DIRTY_I(sbi)->seglist_lock);
+ __set_test_and_free(sbi, curseg->segno, true);
+ mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
+ }
+out:
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi)
+{
+ __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);
+
+ if (sbi->am.atgc_enabled)
+ __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC);
+}
+
+static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+ if (!curseg->inited)
+ goto out;
+ if (get_valid_blocks(sbi, curseg->segno, false))
+ goto out;
+
+ mutex_lock(&DIRTY_I(sbi)->seglist_lock);
+ __set_test_and_inuse(sbi, curseg->segno);
+ mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
+out:
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi)
+{
+ __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);
+
+ if (sbi->am.atgc_enabled)
+ __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC);
+}
+
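__f2fs_save_inmem_curseg() above either persists the summary block of a still-populated in-memory log or hands its segment back to the free map, and the restore side re-marks a surviving segment as in use afterwards; the checkpoint call sites are outside this diff, so that pairing is inferred. The save-side decision, restated as a standalone sketch with local stand-in names (not kernel API):

#include <stdio.h>
#include <stdbool.h>

struct curseg { bool inited; unsigned int segno; unsigned int valid_blocks; };

static void save_inmem_curseg(struct curseg *c)
{
	if (!c->inited)
		return;
	if (c->valid_blocks)
		printf("seg %u: write summary block\n", c->segno);
	else
		printf("seg %u: return segment to free map\n", c->segno);
}

int main(void)
{
	struct curseg atgc   = { true, 42, 128 };	/* still holds data */
	struct curseg pinned = { true, 43, 0 };		/* fully invalidated */

	save_inmem_curseg(&atgc);
	save_inmem_curseg(&pinned);
	return 0;
}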
+static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
+ int alloc_mode, unsigned long long age)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
unsigned segno = NULL_SEGNO;
+ unsigned short seg_type = curseg->seg_type;
int i, cnt;
bool reversed = false;
+ sanity_check_seg_type(sbi, seg_type);
+
/* f2fs_need_SSR() already forces to do this */
- if (!v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
+ if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) {
curseg->next_segno = segno;
return 1;
}
/* For node segments, let's do SSR more intensively */
- if (IS_NODESEG(type)) {
- if (type >= CURSEG_WARM_NODE) {
+ if (IS_NODESEG(seg_type)) {
+ if (seg_type >= CURSEG_WARM_NODE) {
reversed = true;
i = CURSEG_COLD_NODE;
} else {
@@ -2646,7 +2814,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
}
cnt = NR_CURSEG_NODE_TYPE;
} else {
- if (type >= CURSEG_WARM_DATA) {
+ if (seg_type >= CURSEG_WARM_DATA) {
reversed = true;
i = CURSEG_COLD_DATA;
} else {
@@ -2656,9 +2824,9 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
}
for (; cnt-- > 0; reversed ? i-- : i++) {
- if (i == type)
+ if (i == seg_type)
continue;
- if (!v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
+ if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) {
curseg->next_segno = segno;
return 1;
}
@@ -2687,13 +2855,15 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
if (force)
new_curseg(sbi, type, true);
else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
- type == CURSEG_WARM_NODE)
+ curseg->seg_type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
- else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) &&
+ else if (curseg->alloc_type == LFS &&
+ is_next_segment_free(sbi, curseg, type) &&
likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
new_curseg(sbi, type, false);
- else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type))
- change_curseg(sbi, type);
+ else if (f2fs_need_SSR(sbi) &&
+ get_ssr_segment(sbi, type, SSR, 0))
+ change_curseg(sbi, type, true);
else
new_curseg(sbi, type, false);
@@ -2714,8 +2884,8 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
if (segno < start || segno > end)
goto unlock;
- if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type))
- change_curseg(sbi, type);
+ if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
+ change_curseg(sbi, type, true);
else
new_curseg(sbi, type, true);
@@ -2738,11 +2908,15 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type)
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int old_segno;
+ if (!curseg->inited)
+ goto alloc;
+
if (!curseg->next_blkoff &&
!get_valid_blocks(sbi, curseg->segno, false) &&
!get_ckpt_valid_blocks(sbi, curseg->segno))
return;
+alloc:
old_segno = curseg->segno;
SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
locate_dirty_segment(sbi, old_segno);
@@ -2806,7 +2980,7 @@ next:
mutex_lock(&dcc->cmd_lock);
if (unlikely(dcc->rbtree_check))
f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
- &dcc->root));
+ &dcc->root, false));
dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
NULL, start,
@@ -2930,12 +3104,11 @@ out:
return err;
}
-static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
+static bool __has_curseg_space(struct f2fs_sb_info *sbi,
+ struct curseg_info *curseg)
{
- struct curseg_info *curseg = CURSEG_I(sbi, type);
- if (curseg->next_blkoff < sbi->blocks_per_seg)
- return true;
- return false;
+ return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi,
+ curseg->segno);
}
int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
@@ -3075,8 +3248,13 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (fio->type == DATA) {
struct inode *inode = fio->page->mapping->host;
- if (is_cold_data(fio->page) || file_is_cold(inode) ||
- f2fs_compressed_file(inode))
+ if (is_cold_data(fio->page)) {
+ if (fio->sbi->am.atgc_enabled)
+ return CURSEG_ALL_DATA_ATGC;
+ else
+ return CURSEG_COLD_DATA;
+ }
+ if (file_is_cold(inode) || f2fs_compressed_file(inode))
return CURSEG_COLD_DATA;
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
@@ -3126,27 +3304,25 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
- bool put_pin_sem = false;
-
- if (type == CURSEG_COLD_DATA) {
- /* GC during CURSEG_COLD_DATA_PINNED allocation */
- if (down_read_trylock(&sbi->pin_sem)) {
- put_pin_sem = true;
- } else {
- type = CURSEG_WARM_DATA;
- curseg = CURSEG_I(sbi, type);
- }
- } else if (type == CURSEG_COLD_DATA_PINNED) {
- type = CURSEG_COLD_DATA;
- }
+ unsigned long long old_mtime;
+ bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
+ struct seg_entry *se = NULL;
down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
+ if (from_gc) {
+ f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO);
+ se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr));
+ sanity_check_seg_type(sbi, se->type);
+ f2fs_bug_on(sbi, IS_NODESEG(se->type));
+ }
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+ f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg);
+
f2fs_wait_discard_bio(sbi, *new_blkaddr);
/*
@@ -3160,6 +3336,14 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
stat_inc_block_count(sbi, curseg);
+ if (from_gc) {
+ old_mtime = get_segment_mtime(sbi, old_blkaddr);
+ } else {
+ update_segment_mtime(sbi, old_blkaddr, 0);
+ old_mtime = 0;
+ }
+ update_segment_mtime(sbi, *new_blkaddr, old_mtime);
+
/*
* SIT information should be updated before segment allocation,
* since SSR needs latest valid block information.
@@ -3168,9 +3352,13 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
update_sit_entry(sbi, old_blkaddr, -1);
- if (!__has_curseg_space(sbi, type))
- sit_i->s_ops->allocate_segment(sbi, type, false);
-
+ if (!__has_curseg_space(sbi, curseg)) {
+ if (from_gc)
+ get_atssr_segment(sbi, type, se->type,
+ AT_SSR, se->mtime);
+ else
+ sit_i->s_ops->allocate_segment(sbi, type, false);
+ }
/*
* segment dirty status should be updated after segment allocation,
* so we just need to update status only one time after previous
@@ -3204,9 +3392,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
up_read(&SM_I(sbi)->curseg_lock);
-
- if (put_pin_sem)
- up_read(&sbi->pin_sem);
}
static void update_device_state(struct f2fs_io_info *fio)
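The from_gc branches above preserve block age across GC: a relocated block carries its source segment's mtime to the destination segment, while a normal write folds in the current time (and also refreshes the overwritten segment's mtime). Restated standalone with illustrative numbers:

#include <stdio.h>
#include <stdbool.h>

static unsigned long long seg_mtime[4];	/* per-segment mtime */
static unsigned int seg_valid[4];	/* per-segment valid blocks */

static void fold_mtime(unsigned int segno, unsigned long long mtime)
{
	if (!seg_mtime[segno])
		seg_mtime[segno] = mtime;
	else
		seg_mtime[segno] = (seg_mtime[segno] * seg_valid[segno] +
					mtime) / (seg_valid[segno] + 1);
	seg_valid[segno]++;
}

static void allocate_block(unsigned int old, unsigned int new,
			   unsigned long long now, bool from_gc)
{
	/* GC keeps the old age; a normal write uses the current time */
	unsigned long long mtime = from_gc ? seg_mtime[old] : now;

	fold_mtime(new, mtime);
}

int main(void)
{
	seg_mtime[0] = 100;	/* cold source segment, written long ago */
	seg_valid[0] = 10;

	allocate_block(0, 1, 5000, true);	/* GC move */
	allocate_block(0, 2, 5000, false);	/* normal rewrite */

	printf("GC dest mtime:    %llu\n", seg_mtime[1]);	/* 100 */
	printf("write dest mtime: %llu\n", seg_mtime[2]);	/* 5000 */
	return 0;
}

Without this, every GC pass would make cold data look freshly written and the age-threshold heuristic would degenerate.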
@@ -3355,7 +3540,8 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi,
void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t old_blkaddr, block_t new_blkaddr,
- bool recover_curseg, bool recover_newaddr)
+ bool recover_curseg, bool recover_newaddr,
+ bool from_gc)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
@@ -3400,17 +3586,22 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type);
+ change_curseg(sbi, type, true);
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
- if (!recover_curseg || recover_newaddr)
+ if (!recover_curseg || recover_newaddr) {
+ if (!from_gc)
+ update_segment_mtime(sbi, new_blkaddr, 0);
update_sit_entry(sbi, new_blkaddr, 1);
+ }
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
invalidate_mapping_pages(META_MAPPING(sbi),
old_blkaddr, old_blkaddr);
+ if (!from_gc)
+ update_segment_mtime(sbi, old_blkaddr, 0);
update_sit_entry(sbi, old_blkaddr, -1);
}
@@ -3422,7 +3613,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type);
+ change_curseg(sbi, type, true);
}
curseg->next_blkoff = old_blkoff;
}
@@ -3442,7 +3633,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
set_summary(&sum, dn->nid, dn->ofs_in_node, version);
f2fs_do_replace_block(sbi, &sum, old_addr, new_addr,
- recover_curseg, recover_newaddr);
+ recover_curseg, recover_newaddr, false);
f2fs_update_data_blkaddr(dn, new_addr);
}
@@ -3574,7 +3765,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
CURSEG_HOT_DATA]);
if (__exist_node_summaries(sbi))
- blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
+ blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type);
else
blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
} else {
@@ -3652,8 +3843,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
}
if (__exist_node_summaries(sbi))
- f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
- NR_CURSEG_TYPE - type, META_CP, true);
+ f2fs_ra_meta_pages(sbi,
+ sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type),
+ NR_CURSEG_PERSIST_TYPE - type, META_CP, true);
for (; type <= CURSEG_COLD_NODE; type++) {
err = read_normal_summaries(sbi, type);
@@ -3781,7 +3973,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
unsigned int segno)
{
- return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno));
+ return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno));
}
static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
@@ -4155,14 +4347,14 @@ static int build_curseg(struct f2fs_sb_info *sbi)
struct curseg_info *array;
int i;
- array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)),
- GFP_KERNEL);
+ array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE,
+ sizeof(*array)), GFP_KERNEL);
if (!array)
return -ENOMEM;
SM_I(sbi)->curseg_array = array;
- for (i = 0; i < NR_CURSEG_TYPE; i++) {
+ for (i = 0; i < NO_CHECK_TYPE; i++) {
mutex_init(&array[i].curseg_mutex);
array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
@@ -4172,8 +4364,15 @@ static int build_curseg(struct f2fs_sb_info *sbi)
sizeof(struct f2fs_journal), GFP_KERNEL);
if (!array[i].journal)
return -ENOMEM;
+ if (i < NR_PERSISTENT_LOG)
+ array[i].seg_type = CURSEG_HOT_DATA + i;
+ else if (i == CURSEG_COLD_DATA_PINNED)
+ array[i].seg_type = CURSEG_COLD_DATA;
+ else if (i == CURSEG_ALL_DATA_ATGC)
+ array[i].seg_type = CURSEG_COLD_DATA;
array[i].segno = NULL_SEGNO;
array[i].next_blkoff = 0;
+ array[i].inited = false;
}
return restore_curseg_summaries(sbi);
}
@@ -4294,9 +4493,12 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
{
unsigned int start;
int type;
+ struct seg_entry *sentry;
for (start = 0; start < MAIN_SEGS(sbi); start++) {
- struct seg_entry *sentry = get_seg_entry(sbi, start);
+ if (f2fs_usable_blks_in_seg(sbi, start) == 0)
+ continue;
+ sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
else
@@ -4316,7 +4518,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno = 0, offset = 0, secno;
- block_t valid_blocks;
+ block_t valid_blocks, usable_blks_in_seg;
block_t blks_per_sec = BLKS_PER_SEC(sbi);
while (1) {
@@ -4326,9 +4528,10 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, false);
- if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
+ usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
+ if (valid_blocks == usable_blks_in_seg || !valid_blocks)
continue;
- if (valid_blocks > sbi->blocks_per_seg) {
+ if (valid_blocks > usable_blks_in_seg) {
f2fs_bug_on(sbi, 1);
continue;
}
@@ -4408,11 +4611,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
* In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr;
* In LFS curseg, all blkaddr after .next_blkoff should be unused.
*/
- for (i = 0; i < NO_CHECK_TYPE; i++) {
+ for (i = 0; i < NR_PERSISTENT_LOG; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
struct seg_entry *se = get_seg_entry(sbi, curseg->segno);
unsigned int blkofs = curseg->next_blkoff;
+ sanity_check_seg_type(sbi, curseg->seg_type);
+
if (f2fs_test_bit(blkofs, se->cur_valid_map))
goto out;
@@ -4637,7 +4842,7 @@ int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
{
int i, ret;
- for (i = 0; i < NO_CHECK_TYPE; i++) {
+ for (i = 0; i < NR_PERSISTENT_LOG; i++) {
ret = fix_curseg_write_pointer(sbi, i);
if (ret)
return ret;
@@ -4678,6 +4883,101 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
return 0;
}
+
+static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx,
+ unsigned int dev_idx)
+{
+ if (!bdev_is_zoned(FDEV(dev_idx).bdev))
+ return true;
+ return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq);
+}
+
+/* Return the zone index in the given device */
+static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno,
+ int dev_idx)
+{
+ block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno));
+
+ return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >>
+ sbi->log_blocks_per_blkz;
+}
+
+/*
+ * Return the usable segments in a section based on the zone's
+ * corresponding zone capacity. Zone is equal to a section.
+ */
+static inline unsigned int f2fs_usable_zone_segs_in_sec(
+ struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ unsigned int dev_idx, zone_idx, unusable_segs_in_sec;
+
+ dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno));
+ zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx);
+
+ /* Conventional zone's capacity is always equal to zone size */
+ if (is_conv_zone(sbi, zone_idx, dev_idx))
+ return sbi->segs_per_sec;
+
+ /*
+ * If the zone_capacity_blocks array is NULL, then zone capacity
+ * is equal to the zone size for all zones
+ */
+ if (!FDEV(dev_idx).zone_capacity_blocks)
+ return sbi->segs_per_sec;
+
+ /* Get the segment count beyond zone capacity block */
+ unusable_segs_in_sec = (sbi->blocks_per_blkz -
+ FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >>
+ sbi->log_blocks_per_seg;
+ return sbi->segs_per_sec - unusable_segs_in_sec;
+}
+
+/*
+ * Return the number of usable blocks in a segment. The number of blocks
+ * returned is always equal to the number of blocks in a segment for
+ * segments fully contained within a sequential zone capacity or a
+ * conventional zone. For segments partially contained in a sequential
+ * zone capacity, the number of usable blocks up to the zone capacity
+ * is returned. 0 is returned in all other cases.
+ */
+static inline unsigned int f2fs_usable_zone_blks_in_seg(
+ struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr;
+ unsigned int zone_idx, dev_idx, secno;
+
+ secno = GET_SEC_FROM_SEG(sbi, segno);
+ seg_start = START_BLOCK(sbi, segno);
+ dev_idx = f2fs_target_device_index(sbi, seg_start);
+ zone_idx = get_zone_idx(sbi, secno, dev_idx);
+
+ /*
+ * Conventional zone's capacity is always equal to zone size,
+ * so, blocks per segment is unchanged.
+ */
+ if (is_conv_zone(sbi, zone_idx, dev_idx))
+ return sbi->blocks_per_seg;
+
+ if (!FDEV(dev_idx).zone_capacity_blocks)
+ return sbi->blocks_per_seg;
+
+ sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno));
+ sec_cap_blkaddr = sec_start_blkaddr +
+ FDEV(dev_idx).zone_capacity_blocks[zone_idx];
+
+ /*
+ * If segment starts before zone capacity and spans beyond
+ * zone capacity, then usable blocks are from seg start to
+ * zone capacity. If the segment starts after the zone capacity,
+ * then there are no usable blocks.
+ */
+ if (seg_start >= sec_cap_blkaddr)
+ return 0;
+ if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr)
+ return sec_cap_blkaddr - seg_start;
+
+ return sbi->blocks_per_seg;
+}
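A worked example for f2fs_usable_zone_blks_in_seg() above, with illustrative geometry: 512-block (2MB) segments and a zone whose capacity is 1280 blocks, so segment 2 straddles the capacity boundary and segment 3 lies wholly beyond it:

#include <stdio.h>

#define BLKS_PER_SEG 512u

static unsigned int usable_blks(unsigned int seg_start,
				unsigned int sec_cap_blkaddr)
{
	if (seg_start >= sec_cap_blkaddr)
		return 0;
	if (seg_start + BLKS_PER_SEG > sec_cap_blkaddr)
		return sec_cap_blkaddr - seg_start;
	return BLKS_PER_SEG;
}

int main(void)
{
	unsigned int cap = 1280;	/* zone-capacity in blocks */

	for (unsigned int seg = 0; seg < 4; seg++)
		printf("seg %u: %u usable blocks\n",
		       seg, usable_blks(seg * BLKS_PER_SEG, cap));
	return 0;
}

This yields 512, 512, 256, 0, matching the three cases spelled out in the function comment.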
#else
int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
{
@@ -4688,7 +4988,36 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
{
return 0;
}
+
+static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ return 0;
+}
+
+static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ return 0;
+}
#endif
+unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ if (f2fs_sb_has_blkzoned(sbi))
+ return f2fs_usable_zone_blks_in_seg(sbi, segno);
+
+ return sbi->blocks_per_seg;
+}
+
+unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ if (f2fs_sb_has_blkzoned(sbi))
+ return f2fs_usable_zone_segs_in_sec(sbi, segno);
+
+ return sbi->segs_per_sec;
+}
/*
* Update min, max modified time for cost-benefit GC algorithm
@@ -4715,6 +5044,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
sit_i->min_mtime = mtime;
}
sit_i->max_mtime = get_mtime(sbi, false);
+ sit_i->dirty_max_mtime = 0;
up_write(&sit_i->sentry_lock);
}
@@ -4830,7 +5160,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
destroy_victim_secmap(sbi);
SM_I(sbi)->dirty_info = NULL;
- kvfree(dirty_i);
+ kfree(dirty_i);
}
static void destroy_curseg(struct f2fs_sb_info *sbi)
@@ -4842,10 +5172,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
return;
SM_I(sbi)->curseg_array = NULL;
for (i = 0; i < NR_CURSEG_TYPE; i++) {
- kvfree(array[i].sum_blk);
- kvfree(array[i].journal);
+ kfree(array[i].sum_blk);
+ kfree(array[i].journal);
}
- kvfree(array);
+ kfree(array);
}
static void destroy_free_segmap(struct f2fs_sb_info *sbi)
@@ -4856,7 +5186,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = NULL;
kvfree(free_i->free_segmap);
kvfree(free_i->free_secmap);
- kvfree(free_i);
+ kfree(free_i);
}
static void destroy_sit_info(struct f2fs_sb_info *sbi)
@@ -4868,7 +5198,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
if (sit_i->sentries)
kvfree(sit_i->bitmap);
- kvfree(sit_i->tmp_map);
+ kfree(sit_i->tmp_map);
kvfree(sit_i->sentries);
kvfree(sit_i->sec_entries);
@@ -4880,7 +5210,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
kvfree(sit_i->sit_bitmap_mir);
kvfree(sit_i->invalid_segmap);
#endif
- kvfree(sit_i);
+ kfree(sit_i);
}
void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
@@ -4896,7 +5226,7 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
destroy_free_segmap(sbi);
destroy_sit_info(sbi);
sbi->sm_info = NULL;
- kvfree(sm_info);
+ kfree(sm_info);
}
int __init f2fs_create_segment_manager_caches(void)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 752b177073b2..e81eb0748e2a 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -16,13 +16,20 @@
#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */
#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
+#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
-#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE)
+#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE)
+
+static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
+ unsigned short seg_type)
+{
+ f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG);
+}
#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA)
#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA)
@@ -34,7 +41,9 @@
((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno))
#define IS_CURSEC(sbi, secno) \
(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
@@ -48,7 +57,11 @@
((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
(sbi)->segs_per_sec) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- (sbi)->segs_per_sec)) \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \
+ (sbi)->segs_per_sec))
#define MAIN_BLKADDR(sbi) \
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
@@ -132,20 +145,25 @@ enum {
 * In the victim_sel_policy->alloc_mode, there are three block allocation modes.
* LFS writes data sequentially with cleaning operations.
* SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
+ * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into
+ * a fragmented segment which has a similar aging degree.
*/
enum {
LFS = 0,
- SSR
+ SSR,
+ AT_SSR,
};
/*
 * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes.
* GC_CB is based on cost-benefit algorithm.
* GC_GREEDY is based on greedy algorithm.
+ * GC_AT is based on age-threshold algorithm.
*/
enum {
GC_CB = 0,
GC_GREEDY,
+ GC_AT,
ALLOC_NEXT,
FLUSH_DEVICE,
MAX_GC_POLICY,
@@ -174,7 +192,10 @@ struct victim_sel_policy {
unsigned int offset; /* last scanned bitmap offset */
unsigned int ofs_unit; /* bitmap search unit */
unsigned int min_cost; /* minimum cost */
+ unsigned long long oldest_age; /* oldest age of segments having the same min cost */
unsigned int min_segno; /* segment # having min. cost */
+ unsigned long long age; /* mtime of GCed section */
+ unsigned long long age_threshold; /* age threshold */
};
struct seg_entry {
@@ -240,6 +261,8 @@ struct sit_info {
unsigned long long mounted_time; /* mount time */
unsigned long long min_mtime; /* min. modification time */
unsigned long long max_mtime; /* max. modification time */
+ unsigned long long dirty_min_mtime; /* min mtime of candidates in GC_AT */
+ unsigned long long dirty_max_mtime; /* max mtime of candidates in GC_AT */
unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
};
@@ -278,7 +301,7 @@ struct dirty_seglist_info {
/* victim selection function for cleaning and SSR */
struct victim_selection {
int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
- int, int, char);
+ int, int, char, unsigned long long);
};
/* for active log information */
@@ -288,10 +311,12 @@ struct curseg_info {
struct rw_semaphore journal_rwsem; /* protect journal area */
struct f2fs_journal *journal; /* cached journal info */
unsigned char alloc_type; /* current allocation type */
+ unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */
unsigned int segno; /* current segment number */
unsigned short next_blkoff; /* next block offset to write */
unsigned int zone; /* current zone number */
unsigned int next_segno; /* preallocated segment */
+ bool inited; /* indicate inmem log is inited */
};
struct sit_entry_set {
@@ -305,8 +330,6 @@ struct sit_entry_set {
*/
static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
{
- if (type == CURSEG_COLD_DATA_PINNED)
- type = CURSEG_COLD_DATA;
return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
}
@@ -411,6 +434,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
+ unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
spin_lock(&free_i->segmap_lock);
clear_bit(segno, free_i->free_segmap);
@@ -418,7 +442,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
next = find_next_bit(free_i->free_segmap,
start_segno + sbi->segs_per_sec, start_segno);
- if (next >= start_segno + sbi->segs_per_sec) {
+ if (next >= start_segno + usable_segs) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
}
@@ -438,22 +462,23 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi,
}
static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
- unsigned int segno)
+ unsigned int segno, bool inmem)
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
+ unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
spin_lock(&free_i->segmap_lock);
if (test_and_clear_bit(segno, free_i->free_segmap)) {
free_i->free_segments++;
- if (IS_CURSEC(sbi, secno))
+ if (!inmem && IS_CURSEC(sbi, secno))
goto skip_free;
next = find_next_bit(free_i->free_segmap,
start_segno + sbi->segs_per_sec, start_segno);
- if (next >= start_segno + sbi->segs_per_sec) {
+ if (next >= start_segno + usable_segs) {
if (test_and_clear_bit(secno, free_i->free_secmap))
free_i->free_sections++;
}
@@ -500,7 +525,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
return FREE_I(sbi)->free_segments;
}
-static inline int reserved_segments(struct f2fs_sb_info *sbi)
+static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi)
{
return SM_I(sbi)->reserved_segments;
}
@@ -532,7 +557,7 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
static inline int reserved_sections(struct f2fs_sb_info *sbi)
{
- return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
+ return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
}
static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
@@ -546,8 +571,8 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
/* check current node segment */
for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
segno = CURSEG_I(sbi, i)->segno;
- left_blocks = sbi->blocks_per_seg -
- get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+ left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
+ get_seg_entry(sbi, segno)->ckpt_valid_blocks;
if (node_blocks > left_blocks)
return false;
@@ -555,7 +580,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
/* check current data segment */
segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
- left_blocks = sbi->blocks_per_seg -
+ left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
get_seg_entry(sbi, segno)->ckpt_valid_blocks;
if (dent_blocks > left_blocks)
return false;
@@ -677,21 +702,22 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
int valid_blocks = 0;
int cur_pos = 0, next_pos;
+ unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno);
/* check bitmap with valid block count */
do {
if (is_valid) {
next_pos = find_next_zero_bit_le(&raw_sit->valid_map,
- sbi->blocks_per_seg,
+ usable_blks_per_seg,
cur_pos);
valid_blocks += next_pos - cur_pos;
} else
next_pos = find_next_bit_le(&raw_sit->valid_map,
- sbi->blocks_per_seg,
+ usable_blks_per_seg,
cur_pos);
cur_pos = next_pos;
is_valid = !is_valid;
- } while (cur_pos < sbi->blocks_per_seg);
+ } while (cur_pos < usable_blks_per_seg);
if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) {
f2fs_err(sbi, "Mismatch valid blocks %d vs. %d",
@@ -700,8 +726,13 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
+ if (usable_blks_per_seg < sbi->blocks_per_seg)
+ f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map,
+ sbi->blocks_per_seg,
+ usable_blks_per_seg) != sbi->blocks_per_seg);
+
/* check segment usage, and check boundary of a given segment number */
- if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
+ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg
|| segno > TOTAL_SEGS(sbi) - 1)) {
f2fs_err(sbi, "Wrong valid blocks %d or segno %u",
GET_SIT_VBLOCKS(raw_sit), segno);
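The new f2fs_bug_on() above asserts that no valid-map bit is set past the usable range of a capacity-truncated segment. A standalone restatement of that check, using a plain loop in place of find_next_bit_le():

#include <stdio.h>
#include <stdbool.h>

#define BLKS_PER_SEG 512u

static bool beyond_capacity_clear(const unsigned char *valid_map,
				  unsigned int usable)
{
	for (unsigned int b = usable; b < BLKS_PER_SEG; b++)
		if (valid_map[b / 8] & (1u << (b % 8)))
			return false;	/* would trip f2fs_bug_on() */
	return true;
}

int main(void)
{
	unsigned char map[BLKS_PER_SEG / 8] = { 0 };

	map[300 / 8] |= 1u << (300 % 8);	/* stray bit past block 256 */
	printf("usable=256: %s\n",
	       beyond_capacity_clear(map, 256) ? "ok" : "corrupt");
	return 0;
}

A set bit beyond the usable range means a block was allocated in space the zone-capacity says does not exist, so treating it as a bug rather than silently clamping is the right call.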
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index dfa072fa8081..0c958fed3392 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -146,6 +146,7 @@ enum {
Opt_compress_algorithm,
Opt_compress_log_size,
Opt_compress_extension,
+ Opt_atgc,
Opt_err,
};
@@ -213,6 +214,7 @@ static match_table_t f2fs_tokens = {
{Opt_compress_algorithm, "compress_algorithm=%s"},
{Opt_compress_log_size, "compress_log_size=%u"},
{Opt_compress_extension, "compress_extension=%s"},
+ {Opt_atgc, "atgc"},
{Opt_err, NULL},
};
@@ -433,12 +435,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) {
+ if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) {
f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
return -EINVAL;
}
err = fscrypt_set_test_dummy_encryption(
- sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx);
+ sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy);
if (err) {
if (err == -EEXIST)
f2fs_warn(sbi,
@@ -580,7 +582,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_active_logs:
if (args->from && match_int(args, &arg))
return -EINVAL;
- if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+ if (arg != 2 && arg != 4 &&
+ arg != NR_CURSEG_PERSIST_TYPE)
return -EINVAL;
F2FS_OPTION(sbi).active_logs = arg;
break;
@@ -868,8 +871,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
#ifdef CONFIG_F2FS_FS_COMPRESSION
case Opt_compress_algorithm:
if (!f2fs_sb_has_compression(sbi)) {
- f2fs_err(sbi, "Compression feature if off");
- return -EINVAL;
+ f2fs_info(sbi, "Image doesn't support compression");
+ break;
}
name = match_strdup(&args[0]);
if (!name)
@@ -894,8 +897,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
break;
case Opt_compress_log_size:
if (!f2fs_sb_has_compression(sbi)) {
- f2fs_err(sbi, "Compression feature is off");
- return -EINVAL;
+ f2fs_info(sbi, "Image doesn't support compression");
+ break;
}
if (args->from && match_int(args, &arg))
return -EINVAL;
@@ -909,8 +912,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
break;
case Opt_compress_extension:
if (!f2fs_sb_has_compression(sbi)) {
- f2fs_err(sbi, "Compression feature is off");
- return -EINVAL;
+ f2fs_info(sbi, "Image doesn't support compression");
+ break;
}
name = match_strdup(&args[0]);
if (!name)
@@ -938,6 +941,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
f2fs_info(sbi, "compression options not supported");
break;
#endif
+ case Opt_atgc:
+ set_opt(sbi, ATGC);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -964,6 +970,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
#endif
+ /*
+ * The BLKZONED feature indicates that the drive was formatted with
+ * zone alignment optimization. This is optional for host-aware
+ * devices, but mandatory for host-managed zoned block devices.
+ */
+#ifndef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_err(sbi, "Zoned block device support is not enabled");
+ return -EINVAL;
+ }
+#endif
if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
@@ -1001,7 +1018,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
 /* Do not pass down write hints if the number of active logs is less
- * than NR_CURSEG_TYPE.
+ * than NR_CURSEG_PERSIST_TYPE.
*/
if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE)
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
@@ -1020,6 +1037,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
+ atomic_set(&fi->i_compr_blocks, 0);
init_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
@@ -1184,6 +1202,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
blkdev_put(FDEV(i).bdev, FMODE_EXCL);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
+ kfree(FDEV(i).zone_capacity_blocks);
#endif
}
kvfree(sbi->devs);
@@ -1269,18 +1288,19 @@ static void f2fs_put_super(struct super_block *sb)
kfree(sbi->raw_super);
destroy_device_list(sbi);
+ f2fs_destroy_page_array_cache(sbi);
f2fs_destroy_xattr_caches(sbi);
mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
destroy_percpu_info(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -1634,13 +1654,16 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
#ifdef CONFIG_F2FS_FS_COMPRESSION
f2fs_show_compress_options(seq, sbi->sb);
#endif
+
+ if (test_opt(sbi, ATGC))
+ seq_puts(seq, ",atgc");
return 0;
}
static void default_options(struct f2fs_sb_info *sbi)
{
/* init some FS parameters */
- F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE;
+ F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
@@ -1763,6 +1786,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
+ bool no_atgc = !test_opt(sbi, ATGC);
bool checkpoint_changed;
#ifdef CONFIG_QUOTA
int i, j;
@@ -1835,6 +1859,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
+ /* disallow enabling atgc dynamically */
+ if (no_atgc == !!test_opt(sbi, ATGC)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch atgc option is not allowed");
+ goto restore_opts;
+ }
+
/* disallow enable/disable extent_cache dynamically */
if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
err = -EINVAL;
@@ -2482,10 +2513,9 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
ctx, len, fs_data, XATTR_CREATE);
}
-static const union fscrypt_context *
-f2fs_get_dummy_context(struct super_block *sb)
+static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb)
{
- return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx;
+ return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy;
}
static bool f2fs_has_stable_inodes(struct super_block *sb)
@@ -2523,7 +2553,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
- .get_dummy_context = f2fs_get_dummy_context,
+ .get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
.max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
@@ -2680,10 +2710,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
}
if (main_end_blkaddr > seg_end_blkaddr) {
- f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)",
- main_blkaddr,
- segment0_blkaddr +
- (segment_count << log_blocks_per_seg),
+ f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)",
+ main_blkaddr, seg_end_blkaddr,
segment_count_main << log_blocks_per_seg);
return true;
} else if (main_end_blkaddr < seg_end_blkaddr) {
@@ -2701,10 +2729,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
err = __f2fs_commit_super(bh, NULL);
res = err ? "failed" : "done";
}
- f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%u) block(%u)",
- res, main_blkaddr,
- segment0_blkaddr +
- (segment_count << log_blocks_per_seg),
+ f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)",
+ res, main_blkaddr, seg_end_blkaddr,
segment_count_main << log_blocks_per_seg);
if (err)
return true;
@@ -2715,7 +2741,7 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
struct buffer_head *bh)
{
- block_t segment_count, segs_per_sec, secs_per_zone;
+ block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main;
block_t total_sections, blocks_per_seg;
struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
(bh->b_data + F2FS_SUPER_OFFSET);
@@ -2786,6 +2812,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
}
segment_count = le32_to_cpu(raw_super->segment_count);
+ segment_count_main = le32_to_cpu(raw_super->segment_count_main);
segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
total_sections = le32_to_cpu(raw_super->section_count);
@@ -2799,14 +2826,19 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
- if (total_sections > segment_count ||
- total_sections < F2FS_MIN_SEGMENTS ||
+ if (total_sections > segment_count_main || total_sections < 1 ||
segs_per_sec > segment_count || !segs_per_sec) {
f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)",
segment_count, total_sections, segs_per_sec);
return -EFSCORRUPTED;
}
+ if (segment_count_main != total_sections * segs_per_sec) {
+ f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)",
+ segment_count_main, total_sections, segs_per_sec);
+ return -EFSCORRUPTED;
+ }
+
if ((segment_count / segs_per_sec) < total_sections) {
f2fs_info(sbi, "Small segment_count (%u < %u * %u)",
segment_count, segs_per_sec, total_sections);
@@ -2832,6 +2864,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
segment_count, dev_seg_count);
return -EFSCORRUPTED;
}
+ } else {
+ if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) &&
+ !bdev_is_zoned(sbi->sb->s_bdev)) {
+ f2fs_info(sbi, "Zoned block device path is missing");
+ return -EFSCORRUPTED;
+ }
}
if (secs_per_zone > total_sections || !secs_per_zone) {
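The segment_count_main check added above enforces that the main area tiles into sections exactly. A minimal userspace sketch of that arithmetic, using assumed superblock values for illustration:

#include <stdio.h>

int main(void)
{
	/* Assumed values for illustration only */
	unsigned int segs_per_sec = 2;
	unsigned int total_sections = 256;
	unsigned int segment_count_main = 512;

	/* Mirrors the new sanity check: main area must tile exactly */
	if (segment_count_main != total_sections * segs_per_sec)
		printf("invalid: %u != %u * %u\n", segment_count_main,
		       total_sections, segs_per_sec);
	else
		printf("consistent: 512 == 256 * 2\n");
	return 0;
}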
@@ -2907,7 +2945,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
- if (unlikely(fsmeta < F2FS_MIN_SEGMENTS ||
+ if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS ||
ovp_segments == 0 || reserved_segments == 0)) {
f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version");
return 1;
@@ -2995,7 +3033,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
cp_payload = __cp_payload(sbi);
if (cp_pack_start_sum < cp_payload + 1 ||
cp_pack_start_sum > blocks_per_seg - 1 -
- NR_CURSEG_TYPE) {
+ NR_CURSEG_PERSIST_TYPE) {
f2fs_err(sbi, "Wrong cp_pack_start_sum: %u",
cp_pack_start_sum);
return 1;
@@ -3088,13 +3126,26 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_BLK_DEV_ZONED
+
+struct f2fs_report_zones_args {
+ struct f2fs_dev_info *dev;
+ bool zone_cap_mismatch;
+};
+
static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
+ void *data)
{
- struct f2fs_dev_info *dev = data;
+ struct f2fs_report_zones_args *rz_args = data;
+
+ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+ return 0;
+
+ set_bit(idx, rz_args->dev->blkz_seq);
+ rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >>
+ F2FS_LOG_SECTORS_PER_BLOCK;
+ if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch)
+ rz_args->zone_cap_mismatch = true;
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
- set_bit(idx, dev->blkz_seq);
return 0;
}
@@ -3102,6 +3153,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
{
struct block_device *bdev = FDEV(devi).bdev;
sector_t nr_sectors = bdev->bd_part->nr_sects;
+ struct f2fs_report_zones_args rep_zone_arg;
int ret;
if (!f2fs_sb_has_blkzoned(sbi))
@@ -3127,12 +3179,26 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
if (!FDEV(devi).blkz_seq)
return -ENOMEM;
- /* Get block zones type */
+ /* Get block zones type and zone-capacity */
+ FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi,
+ FDEV(devi).nr_blkz * sizeof(block_t),
+ GFP_KERNEL);
+ if (!FDEV(devi).zone_capacity_blocks)
+ return -ENOMEM;
+
+ rep_zone_arg.dev = &FDEV(devi);
+ rep_zone_arg.zone_cap_mismatch = false;
+
ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb,
- &FDEV(devi));
+ &rep_zone_arg);
if (ret < 0)
return ret;
+ if (!rep_zone_arg.zone_cap_mismatch) {
+ kfree(FDEV(devi).zone_capacity_blocks);
+ FDEV(devi).zone_capacity_blocks = NULL;
+ }
+
return 0;
}
#endif
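The new report-zones callback records a per-zone usable capacity, which on zoned drives (e.g. NVMe ZNS) can be smaller than the zone size. A minimal sketch of the sector-to-block conversion it performs, with assumed geometry (512-byte sectors, 4KB blocks, i.e. a shift of 3 as in F2FS_LOG_SECTORS_PER_BLOCK):

#include <stdio.h>

int main(void)
{
	/* Assumed geometry: 512-byte sectors, 4KB blocks (shift of 3) */
	unsigned long long zone_len_sectors = 2097152; /* 1 GiB zone size  */
	unsigned long long zone_cap_sectors = 1835008; /* 896 MiB capacity */
	unsigned long long cap_blocks = zone_cap_sectors >> 3;

	/* f2fs must never allocate blocks past cap_blocks within the zone */
	printf("zone holds %llu sectors, %llu usable blocks\n",
	       zone_len_sectors, cap_blocks); /* 229376 blocks */
	return 0;
}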
@@ -3329,7 +3395,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
{
#ifdef CONFIG_UNICODE
- if (f2fs_sb_has_casefold(sbi) && !sbi->s_encoding) {
+ if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) {
const struct f2fs_sb_encodings *encoding_info;
struct unicode_map *encoding;
__u16 encoding_flags;
@@ -3360,8 +3426,8 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
"%s-%s with flags 0x%hx", encoding_info->name,
encoding_info->version?:"\b", encoding_flags);
- sbi->s_encoding = encoding;
- sbi->s_encoding_flags = encoding_flags;
+ sbi->sb->s_encoding = encoding;
+ sbi->sb->s_encoding_flags = encoding_flags;
sbi->sb->s_d_op = &f2fs_dentry_ops;
}
#else
@@ -3440,18 +3506,6 @@ try_onemore:
sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
sizeof(raw_super->uuid));
- /*
- * The BLKZONED feature indicates that the drive was formatted with
- * zone alignment optimization. This is optional for host-aware
- * devices, but mandatory for host-managed zoned block devices.
- */
-#ifndef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_has_blkzoned(sbi)) {
- f2fs_err(sbi, "Zoned block device support is not enabled");
- err = -EOPNOTSUPP;
- goto free_sb_buf;
- }
-#endif
default_options(sbi);
/* parse mount options */
options = kstrdup((const char *)data, GFP_KERNEL);
@@ -3566,13 +3620,16 @@ try_onemore:
err = f2fs_init_xattr_caches(sbi);
if (err)
goto free_io_dummy;
+ err = f2fs_init_page_array_cache(sbi);
+ if (err)
+ goto free_xattr_cache;
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
f2fs_err(sbi, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
- goto free_xattr_cache;
+ goto free_page_array_cache;
}
err = f2fs_get_valid_checkpoint(sbi);
@@ -3762,6 +3819,8 @@ try_onemore:
}
reset_checkpoint:
+ f2fs_init_inmem_curseg(sbi);
+
/* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -3846,6 +3905,8 @@ free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
sbi->meta_inode = NULL;
+free_page_array_cache:
+ f2fs_destroy_page_array_cache(sbi);
free_xattr_cache:
f2fs_destroy_xattr_caches(sbi);
free_io_dummy:
@@ -3857,14 +3918,14 @@ free_bio_info:
kvfree(sbi->write_io[i]);
#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ utf8_unload(sb->s_encoding);
#endif
free_options:
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
kvfree(options);
free_sb_buf:
kfree(raw_super);
@@ -3967,9 +4028,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_create_extent_cache();
if (err)
goto free_checkpoint_caches;
- err = f2fs_init_sysfs();
+ err = f2fs_create_garbage_collection_cache();
if (err)
goto free_extent_cache;
+ err = f2fs_init_sysfs();
+ if (err)
+ goto free_garbage_collection_cache;
err = register_shrinker(&f2fs_shrinker_info);
if (err)
goto free_sysfs;
@@ -3989,7 +4053,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_compress_mempool();
if (err)
goto free_bioset;
+ err = f2fs_init_compress_cache();
+ if (err)
+ goto free_compress_mempool;
return 0;
+free_compress_mempool:
+ f2fs_destroy_compress_mempool();
free_bioset:
f2fs_destroy_bioset();
free_bio_enrty_cache:
@@ -4003,6 +4072,8 @@ free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
free_sysfs:
f2fs_exit_sysfs();
+free_garbage_collection_cache:
+ f2fs_destroy_garbage_collection_cache();
free_extent_cache:
f2fs_destroy_extent_cache();
free_checkpoint_caches:
@@ -4019,6 +4090,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
+ f2fs_destroy_compress_cache();
f2fs_destroy_compress_mempool();
f2fs_destroy_bioset();
f2fs_destroy_bio_entry_cache();
@@ -4027,6 +4099,7 @@ static void __exit exit_f2fs_fs(void)
unregister_filesystem(&f2fs_fs_type);
unregister_shrinker(&f2fs_shrinker_info);
f2fs_exit_sysfs();
+ f2fs_destroy_garbage_collection_cache();
f2fs_destroy_extent_cache();
f2fs_destroy_checkpoint_caches();
f2fs_destroy_segment_manager_caches();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 88ed9969cc86..ec77ccfea923 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -176,12 +176,14 @@ static ssize_t encoding_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
#ifdef CONFIG_UNICODE
+ struct super_block *sb = sbi->sb;
+
if (f2fs_sb_has_casefold(sbi))
return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n",
- sbi->s_encoding->charset,
- (sbi->s_encoding->version >> 16) & 0xff,
- (sbi->s_encoding->version >> 8) & 0xff,
- sbi->s_encoding->version & 0xff);
+ sb->s_encoding->charset,
+ (sb->s_encoding->version >> 16) & 0xff,
+ (sb->s_encoding->version >> 8) & 0xff,
+ sb->s_encoding->version & 0xff);
#endif
return sprintf(buf, "(none)");
}
@@ -375,12 +377,17 @@ out:
return count;
}
if (!strcmp(a->attr.name, "gc_idle")) {
- if (t == GC_IDLE_CB)
+ if (t == GC_IDLE_CB) {
sbi->gc_mode = GC_IDLE_CB;
- else if (t == GC_IDLE_GREEDY)
+ } else if (t == GC_IDLE_GREEDY) {
sbi->gc_mode = GC_IDLE_GREEDY;
- else
+ } else if (t == GC_IDLE_AT) {
+ if (!sbi->am.atgc_enabled)
+ return -EINVAL;
+ sbi->gc_mode = GC_AT;
+ } else {
sbi->gc_mode = GC_NORMAL;
+ }
return count;
}
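The gc_idle hunk above adds an age-threshold choice to the existing cost-benefit/greedy dispatch, rejecting it when ATGC is not enabled. A hedged sketch of the value-to-mode mapping (enumerator values 0..3 assumed from the f2fs sysfs documentation, not taken from this patch):

#include <stdio.h>

/* Assumed to mirror f2fs's gc_mode enumerators (values 0..3) */
enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, GC_IDLE_AT };

static const char *gc_idle_name(int t)
{
	switch (t) {
	case GC_IDLE_CB:     return "cost-benefit";
	case GC_IDLE_GREEDY: return "greedy";
	case GC_IDLE_AT:     return "age-threshold (requires atgc)";
	default:             return "normal";
	}
}

int main(void)
{
	for (int t = 0; t <= 3; t++)
		printf("gc_idle=%d -> %s\n", t, gc_idle_name(t));
	return 0;
}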
@@ -968,4 +975,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
}
kobject_del(&sbi->s_kobj);
kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
}
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 9eb0dba851e8..054ec852b5ea 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -228,6 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
@@ -237,8 +238,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
if (page)
put_page(page);
else if (num_ra_pages > 1)
- page_cache_readahead_unbounded(inode->i_mapping, NULL,
- index, num_ra_pages, 0);
+ page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
return page;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 1b0736ce0918..65afcc3cc68a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -39,7 +39,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
if (is_inline)
kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
else
- kvfree(xattr_addr);
+ kfree(xattr_addr);
}
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
@@ -425,7 +425,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
*base_addr = txattr_addr;
return 0;
fail:
- kvfree(txattr_addr);
+ kfree(txattr_addr);
return err;
}
@@ -610,7 +610,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
error = buffer_size - rest;
cleanup:
- kvfree(base_addr);
+ kfree(base_addr);
return error;
}
@@ -750,7 +750,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
- kvfree(base_addr);
+ kfree(base_addr);
return error;
}
diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..4559b5fec3bd 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -21,6 +21,7 @@
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>
+#include <linux/io_uring.h>
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -452,6 +453,7 @@ void exit_files(struct task_struct *tsk)
struct files_struct * files = tsk->files;
if (files) {
+ io_uring_files_cancel(files);
task_lock(tsk);
tsk->files = NULL;
task_unlock(tsk);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 149227160ff0..e6005c78bfa9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2184,7 +2184,7 @@ static int __init start_dirtytime_writeback(void)
__initcall(start_dirtytime_writeback);
int dirtytime_interval_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2321,7 +2321,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wb = locked_inode_to_wb_and_lock_list(inode);
- WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+ WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
!test_bit(WB_registered, &wb->state),
"bdi-%s not registered\n", bdi_dev_name(wb->bdi));
@@ -2346,7 +2346,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* to make sure background write-back happens
* later.
*/
- if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+ if (wakeup_bdi &&
+ (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
wb_wakeup_delayed(wb);
return;
}
@@ -2581,7 +2582,7 @@ int write_inode_now(struct inode *inode, int sync)
.range_end = LLONG_MAX,
};
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index ab53e42a874a..68b0148f4bb8 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -189,7 +189,7 @@ out:
}
EXPORT_SYMBOL(fs_lookup_param);
-int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
+static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
{
return inval_plog(log, "Bad value for '%s'", param->key);
}
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 774b2618018a..40ce9a1c12e5 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -8,7 +8,7 @@ config FUSE_FS
There's also a companion library: libfuse2. This library is available
from the FUSE homepage:
- <http://fuse.sourceforge.net/>
+ <https://github.com/libfuse/>
although chances are your distribution already has that library
installed if you've installed the "fuse" package itself.
@@ -38,3 +38,17 @@ config VIRTIO_FS
If you want to share files between guests or with the host, answer Y
or M.
+
+config FUSE_DAX
+ bool "Virtio Filesystem Direct Host Memory Access support"
+ default y
+ select INTERVAL_TREE
+ depends on VIRTIO_FS
+ depends on FS_DAX
+ depends on DAX_DRIVER
+ help
+ This allows bypassing the guest page cache and mapping the host page
+ cache directly into the guest address space.
+
+ If you want to allow mounting a Virtio Filesystem with the "dax"
+ option, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3e8cebfb59b7..8c7021fb2cd4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
-virtiofs-y += virtio_fs.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+fuse-$(CONFIG_FUSE_DAX) += dax.o
+
+virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a1303ad303ba..cc7e94d73c6c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,6 +164,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -174,18 +175,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
if (!fc)
goto out;
+ down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
- if (fc->sb) {
+
+ /*
+ * Get any fuse_mount belonging to this fuse_conn; s_bdi is
+ * shared between all of them
+ */
+
+ if (!list_empty(&fc->mounts)) {
+ fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
} else {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
}
spin_unlock(&fc->bg_lock);
+ up_read(&fc->killsb);
fuse_conn_put(fc);
out:
return ret;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 2cc17816d7b1..45082269e698 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -57,6 +57,7 @@
struct cuse_conn {
struct list_head list; /* linked on cuse_conntbl */
+ struct fuse_mount fm; /* Dummy mount referencing fc */
struct fuse_conn fc; /* fuse connection */
struct cdev *cdev; /* associated character device */
struct device *dev; /* device representing @cdev */
@@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct file *file)
* Generic permission check is already done against the chrdev
* file, proceed to open.
*/
- rc = fuse_do_open(&cc->fc, 0, file, 0);
+ rc = fuse_do_open(&cc->fm, 0, file, 0);
if (rc)
fuse_conn_put(&cc->fc);
return rc;
@@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file)
static int cuse_release(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_sync_release(NULL, ff, file->f_flags);
- fuse_conn_put(fc);
+ fuse_conn_put(fm->fc);
return 0;
}
@@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = 0;
if (cc->unrestricted_ioctl)
@@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = FUSE_IOCTL_COMPAT;
if (cc->unrestricted_ioctl)
@@ -313,9 +314,10 @@ struct cuse_init_args {
* required data structures for it. Please read the comment at the
* top of this file for high level overview.
*/
-static void cuse_process_init_reply(struct fuse_conn *fc,
+static void cuse_process_init_reply(struct fuse_mount *fm,
struct fuse_args *args, int error)
{
+ struct fuse_conn *fc = fm->fc;
struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
struct fuse_args_pages *ap = &ia->ap;
struct cuse_conn *cc = fc_to_cc(fc), *pos;
@@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc)
{
int rc;
struct page *page;
- struct fuse_conn *fc = &cc->fc;
+ struct fuse_mount *fm = &cc->fm;
struct cuse_init_args *ia;
struct fuse_args_pages *ap;
@@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc)
ia->desc.length = ap->args.out_args[1].size;
ap->args.end = cuse_process_init_reply;
- rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (rc) {
kfree(ia);
err_free_page:
@@ -506,7 +508,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
* Limit the cuse channel to requests that can
* be represented in file->f_cred->user_ns.
*/
- fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
+ fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns,
+ &fuse_dev_fiq_ops, NULL);
fud = fuse_dev_alloc_install(&cc->fc);
if (!fud) {
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
new file mode 100644
index 000000000000..ff99ab2a3c43
--- /dev/null
+++ b/fs/fuse/dax.c
@@ -0,0 +1,1365 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dax: direct host memory access
+ * Copyright (C) 2020 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/delay.h>
+#include <linux/dax.h>
+#include <linux/uio.h>
+#include <linux/pfn_t.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree.h>
+
+/*
+ * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ */
+#define FUSE_DAX_SHIFT 21
+#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
+#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
+
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK (10)
+
+/*
+ * DAX memory reclaim threshold as a percentage of total ranges. When the
+ * number of free ranges drops below this threshold, reclaim can trigger.
+ * Default is 20%.
+ */
+#define FUSE_DAX_RECLAIM_THRESHOLD (20)
+
+/** Translation information for file offsets to DAX window offsets */
+struct fuse_dax_mapping {
+ /* Pointer to inode where this memory range is mapped */
+ struct inode *inode;
+
+ /* Will connect in fcd->free_ranges to keep track of free memory */
+ struct list_head list;
+
+ /* For interval tree in file/inode */
+ struct interval_tree_node itn;
+
+ /* Will connect in fcd->busy_ranges to keep track of busy memory */
+ struct list_head busy_list;
+
+ /** Position in DAX window */
+ u64 window_offset;
+
+ /** Length of mapping, in bytes */
+ loff_t length;
+
+ /* Is this mapping read-only or read-write */
+ bool writable;
+
+ /* reference count when the mapping is used by dax iomap. */
+ refcount_t refcnt;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+ /* Semaphore to protect modifications to the dmap tree */
+ struct rw_semaphore sem;
+
+ /* Sorted rb tree of struct fuse_dax_mapping elements */
+ struct rb_root_cached tree;
+ unsigned long nr;
+};
+
+struct fuse_conn_dax {
+ /* DAX device */
+ struct dax_device *dev;
+
+ /* Lock protecting accesses to members of this structure */
+ spinlock_t lock;
+
+ /* List of memory ranges which are busy */
+ unsigned long nr_busy_ranges;
+ struct list_head busy_ranges;
+
+ /* Worker to free up memory ranges */
+ struct delayed_work free_work;
+
+ /* Wait queue for a dax range to become free */
+ wait_queue_head_t range_waitq;
+
+ /* DAX Window Free Ranges */
+ long nr_free_ranges;
+ struct list_head free_ranges;
+
+ unsigned long nr_ranges;
+};
+
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+ if (!node)
+ return NULL;
+
+ return container_of(node, struct fuse_dax_mapping, itn);
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode);
+
+static void
+__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms)
+{
+ unsigned long free_threshold;
+
+ /* If the number of free ranges is below the threshold, start reclaim */
+ free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100,
+ 1);
+ if (fcd->nr_free_ranges < free_threshold)
+ queue_delayed_work(system_long_wq, &fcd->free_work,
+ msecs_to_jiffies(delay_ms));
+}
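A worked example of the trigger above, assuming a pool of 64 ranges: the threshold is max(64 * 20 / 100, 1) = 12, so the free worker is queued once fewer than 12 ranges remain free. A minimal sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long nr_ranges = 64; /* assumed pool size */
	unsigned long threshold = nr_ranges * 20 / 100;

	if (threshold < 1)
		threshold = 1;
	/* Prints 12: the worker queues once free ranges drop below 12 */
	printf("reclaim kicks in below %lu free ranges\n", threshold);
	return 0;
}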
+
+static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
+ unsigned long delay_ms)
+{
+ spin_lock(&fcd->lock);
+ __kick_dmap_free_worker(fcd, delay_ms);
+ spin_unlock(&fcd->lock);
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+ struct fuse_dax_mapping *dmap;
+
+ spin_lock(&fcd->lock);
+ dmap = list_first_entry_or_null(&fcd->free_ranges,
+ struct fuse_dax_mapping, list);
+ if (dmap) {
+ list_del_init(&dmap->list);
+ WARN_ON(fcd->nr_free_ranges <= 0);
+ fcd->nr_free_ranges--;
+ }
+ spin_unlock(&fcd->lock);
+
+ kick_dmap_free_worker(fcd, 0);
+ return dmap;
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_del_init(&dmap->busy_list);
+ WARN_ON(fcd->nr_busy_ranges == 0);
+ fcd->nr_busy_ranges--;
+}
+
+static void dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ spin_lock(&fcd->lock);
+ __dmap_remove_busy_list(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_add_tail(&dmap->list, &fcd->free_ranges);
+ fcd->nr_free_ranges++;
+ wake_up(&fcd->range_waitq);
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ /* Return fuse_dax_mapping to free list */
+ spin_lock(&fcd->lock);
+ __dmap_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+ struct fuse_dax_mapping *dmap, bool writable,
+ bool upgrade)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn_dax *fcd = fm->fc->dax;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_setupmapping_in inarg;
+ loff_t offset = start_idx << FUSE_DAX_SHIFT;
+ FUSE_ARGS(args);
+ ssize_t err;
+
+ WARN_ON(fcd->nr_free_ranges < 0);
+
+ /* Ask fuse daemon to setup mapping */
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.foffset = offset;
+ inarg.fh = -1;
+ inarg.moffset = dmap->window_offset;
+ inarg.len = FUSE_DAX_SZ;
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+ if (writable)
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+ args.opcode = FUSE_SETUPMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ err = fuse_simple_request(fm, &args);
+ if (err < 0)
+ return err;
+ dmap->writable = writable;
+ if (!upgrade) {
+ /*
+ * We don't take a reference on the inode. The inode is valid right
+ * now, and when the inode is going away, cleanup logic should first
+ * clean up the dmap entries.
+ */
+ dmap->inode = inode;
+ dmap->itn.start = dmap->itn.last = start_idx;
+ /* Protected by fi->dax->sem */
+ interval_tree_insert(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr++;
+ spin_lock(&fcd->lock);
+ list_add_tail(&dmap->busy_list, &fcd->busy_ranges);
+ fcd->nr_busy_ranges++;
+ spin_unlock(&fcd->lock);
+ }
+ return 0;
+}
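fuse_setup_one_mapping() keys each interval tree node by a single 2MB range index (itn.start == itn.last == start_idx). A small sketch of the position-to-index math used throughout this file, with illustrative offsets:

#include <stdio.h>

#define FUSE_DAX_SHIFT 21 /* 2 MiB ranges, as in the patch */

int main(void)
{
	unsigned long long pos = (5ULL << 20) + 4096; /* 5 MiB + 4 KiB */
	unsigned long long start_idx = pos >> FUSE_DAX_SHIFT;

	/* 5 MiB falls inside the third 2 MiB range, so index 2 */
	printf("pos 0x%llx -> range index %llu\n", pos, start_idx);
	return 0;
}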
+
+static int fuse_send_removemapping(struct inode *inode,
+ struct fuse_removemapping_in *inargp,
+ struct fuse_removemapping_one *remove_one)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+
+ args.opcode = FUSE_REMOVEMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(*inargp);
+ args.in_args[0].value = inargp;
+ args.in_args[1].size = inargp->count * sizeof(*remove_one);
+ args.in_args[1].value = remove_one;
+ return fuse_simple_request(fm, &args);
+}
+
+static int dmap_removemapping_list(struct inode *inode, unsigned int num,
+ struct list_head *to_remove)
+{
+ struct fuse_removemapping_one *remove_one, *ptr;
+ struct fuse_removemapping_in inarg;
+ struct fuse_dax_mapping *dmap;
+ int ret, i = 0, nr_alloc;
+
+ nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+ remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+ if (!remove_one)
+ return -ENOMEM;
+
+ ptr = remove_one;
+ list_for_each_entry(dmap, to_remove, list) {
+ ptr->moffset = dmap->window_offset;
+ ptr->len = dmap->length;
+ ptr++;
+ i++;
+ num--;
+ if (i >= nr_alloc || num == 0) {
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = i;
+ ret = fuse_send_removemapping(inode, &inarg,
+ remove_one);
+ if (ret)
+ goto out;
+ ptr = remove_one;
+ i = 0;
+ }
+ }
+out:
+ kfree(remove_one);
+ return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list. This should be called with
+ * fcd->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
+ dmap->itn.start, dmap->itn.last, dmap->window_offset,
+ dmap->length);
+ __dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+ __dmap_add_to_free_pool(fcd, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls inside [start, end].
+ * Does not take any locks. At this point it should only be called
+ * from the evict_inode() path, where we know all dmap entries can be
+ * reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ loff_t start, loff_t end)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap, *n;
+ int err, num = 0;
+ LIST_HEAD(to_remove);
+ unsigned long start_idx = start >> FUSE_DAX_SHIFT;
+ unsigned long end_idx = end >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ while (1) {
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx,
+ end_idx);
+ if (!node)
+ break;
+ dmap = node_to_dmap(node);
+ /* inode is going away. There should not be any users of dmap */
+ WARN_ON(refcount_read(&dmap->refcnt) > 1);
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ num++;
+ list_add(&dmap->list, &to_remove);
+ }
+
+ /* Nothing to remove */
+ if (list_empty(&to_remove))
+ return;
+
+ WARN_ON(fi->dax->nr < num);
+ fi->dax->nr -= num;
+ err = dmap_removemapping_list(inode, num, &to_remove);
+ if (err && err != -ENOTCONN) {
+ pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
+ start, end);
+ }
+ spin_lock(&fcd->lock);
+ list_for_each_entry_safe(dmap, n, &to_remove, list) {
+ list_del_init(&dmap->list);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ }
+ spin_unlock(&fcd->lock);
+}
+
+static int dmap_removemapping_one(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ struct fuse_removemapping_one forget_one;
+ struct fuse_removemapping_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = 1;
+ memset(&forget_one, 0, sizeof(forget_one));
+ forget_one.moffset = dmap->window_offset;
+ forget_one.len = dmap->length;
+
+ return fuse_send_removemapping(inode, &inarg, &forget_one);
+}
+
+/*
+ * This is called from evict_inode(), by which time the inode is going
+ * away. So this function does not take any locks like fi->dax->sem for
+ * traversing the fuse inode interval tree. If that lock were taken, the
+ * lock validator would complain of a deadlock w.r.t. the fs_reclaim lock.
+ */
+void fuse_dax_inode_cleanup(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * fuse_evict_inode() has already called truncate_inode_pages_final()
+ * before we arrive here. So we should not have to worry about any
+ * pages/exception entries still associated with inode.
+ */
+ inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+ WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap, struct fuse_dax_mapping *dmap,
+ unsigned int flags)
+{
+ loff_t offset, len;
+ loff_t i_size = i_size_read(inode);
+
+ offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+ len = min(length, dmap->length - offset);
+
+ /* If length is beyond end of file, truncate further */
+ if (pos + len > i_size)
+ len = i_size - pos;
+
+ if (len > 0) {
+ iomap->addr = dmap->window_offset + offset;
+ iomap->length = len;
+ if (flags & IOMAP_FAULT)
+ iomap->length = ALIGN(len, PAGE_SIZE);
+ iomap->type = IOMAP_MAPPED;
+ /*
+ * Increase refcnt so that reclaim code knows this dmap is in
+ * use. This assumes fi->dax->sem is held either shared or
+ * exclusive.
+ */
+ refcount_inc(&dmap->refcnt);
+
+ /* iomap->private should be NULL */
+ WARN_ON_ONCE(iomap->private);
+ iomap->private = dmap;
+ } else {
+ /* Mapping beyond end of file is hole */
+ fuse_fill_iomap_hole(iomap, length);
+ }
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+ int ret;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Can't do inline reclaim in fault path. We call
+ * dax_layout_busy_page() before we free a range. And
+ * fuse_wait_dax_page() drops the fi->i_mmap_sem lock and then reacquires it.
+ * In fault path we enter with fi->i_mmap_sem held and can't drop
+ * it. Also in fault path we hold fi->i_mmap_sem shared and not
+ * exclusive, so that creates further issues with fuse_wait_dax_page().
+ * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
+ * range to become free and retry.
+ */
+ if (flags & IOMAP_FAULT) {
+ alloc_dmap = alloc_dax_mapping(fcd);
+ if (!alloc_dmap)
+ return -EAGAIN;
+ } else {
+ alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
+ if (IS_ERR(alloc_dmap))
+ return PTR_ERR(alloc_dmap);
+ }
+
+ /* If we are here, we should have memory allocated */
+ if (WARN_ON(!alloc_dmap))
+ return -EIO;
+
+ /*
+ * Take write lock so that only one caller can try to setup mapping
+ * and other waits.
+ */
+ down_write(&fi->dax->sem);
+ /*
+ * We dropped the lock. Check again whether somebody else set up
+ * the mapping already.
+ */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return 0;
+ }
+
+ /* Setup one mapping */
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+ writable, false);
+ if (ret < 0) {
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return ret;
+ }
+ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+ up_write(&fi->dax->sem);
+ return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ int ret;
+ unsigned long idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Take exclusive lock so that only one caller can try to setup
+ * mapping and others wait.
+ */
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+ /* We are holding either inode lock or i_mmap_sem, and that should
+ * ensure that dmap can't be truncated. We are holding a reference
+ * on dmap and that should make sure it can't be reclaimed. So dmap
+ * should still be there in tree despite the fact we dropped and
+ * re-acquired the fi->dax->sem lock.
+ */
+ ret = -EIO;
+ if (WARN_ON(!node))
+ goto out_err;
+
+ dmap = node_to_dmap(node);
+
+ /* We took an extra reference on dmap to make sure it's not reclaimed.
+ * Now we hold fi->dax->sem lock and that reference is not needed
+ * anymore. Drop it.
+ */
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Maybe another thread already upgraded the mapping while we were
+ * not holding the lock.
+ */
+ if (dmap->writable) {
+ ret = 0;
+ goto out_fill_iomap;
+ }
+
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+ true);
+ if (ret < 0)
+ goto out_err;
+out_fill_iomap:
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+ up_write(&fi->dax->sem);
+ return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_dax_mapping *dmap;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /* We don't support FIEMAP */
+ if (WARN_ON(flags & IOMAP_REPORT))
+ return -EIO;
+
+ iomap->offset = pos;
+ iomap->flags = 0;
+ iomap->bdev = NULL;
+ iomap->dax_dev = fc->dax->dev;
+
+ /*
+ * Both the read/write and mmap paths can race here. So we need
+ * something to make sure that if we are setting up a mapping, the
+ * other path waits.
+ *
+ * For now, use a semaphore for this. It probably needs to be
+ * optimized later.
+ */
+ down_read(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ if (writable && !dmap->writable) {
+ /* Upgrade read-only mapping to read-write. This will
+ * require exclusive fi->dax->sem lock as we don't want
+ * two threads to be trying to do this simultaneously
+ * for the same dmap. So drop the shared lock and acquire
+ * the exclusive lock.
+ *
+ * Before dropping the fi->dax->sem lock, take a reference
+ * on dmap so that it's not freed by range reclaim.
+ */
+ refcount_inc(&dmap->refcnt);
+ up_read(&fi->dax->sem);
+ pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ return fuse_upgrade_dax_mapping(inode, pos, length,
+ flags, iomap);
+ } else {
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ up_read(&fi->dax->sem);
+ return 0;
+ }
+ } else {
+ up_read(&fi->dax->sem);
+ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ if (pos >= i_size_read(inode))
+ goto iomap_hole;
+
+ return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+ iomap);
+ }
+
+ /*
+ * If a read beyond end of file happens, fs code seems to return
+ * it as a hole.
+ */
+iomap_hole:
+ fuse_fill_iomap_hole(iomap, length);
+ pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+ __func__, pos, length, iomap->length);
+ return 0;
+}
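fuse_iomap_begin() cannot upgrade a shared rwsem in place, so it drops the shared lock, takes the exclusive one, and rechecks state in fuse_upgrade_dax_mapping(). A generic sketch of that drop-and-retake pattern using POSIX rwlocks (all names here are illustrative stand-ins, not from the patch):

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER;
static bool mapping_writable; /* stand-in for dmap->writable */

void upgrade_mapping(void)
{
	pthread_rwlock_rdlock(&sem);
	if (!mapping_writable) {
		/* Can't upgrade in place: drop shared, take exclusive */
		pthread_rwlock_unlock(&sem);
		pthread_rwlock_wrlock(&sem);
		if (!mapping_writable) /* recheck: another thread may have won */
			mapping_writable = true;
	}
	pthread_rwlock_unlock(&sem);
}

int main(void)
{
	upgrade_mapping();
	return 0;
}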
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_dax_mapping *dmap = iomap->private;
+
+ if (dmap) {
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ /* DAX writes beyond end-of-file aren't handled using iomap, so the
+ * file size is unchanged and there is nothing to do here.
+ */
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+ .iomap_end = fuse_iomap_end,
+};
+
+static void fuse_wait_dax_page(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ up_write(&fi->i_mmap_sem);
+ schedule();
+ down_write(&fi->i_mmap_sem);
+}
+
+/* Should be called with fi->i_mmap_sem lock held exclusively */
+static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
+ loff_t start, loff_t end)
+{
+ struct page *page;
+
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, fuse_wait_dax_page(inode));
+}
+
+/* dmap_end == 0 leads to unmapping of the whole file */
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
+ u64 dmap_end)
+{
+ bool retry;
+ int ret;
+
+ do {
+ retry = false;
+ ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
+ dmap_end);
+ } while (ret == 0 && retry);
+
+ return ret;
+}
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+ inode_unlock_shared(inode);
+
+ /* TODO file_accessed(iocb->ki_filp) */
+ return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return (iov_iter_rw(from) == WRITE &&
+ ((iocb->ki_pos) >= i_size_read(inode) ||
+ (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+ ssize_t ret;
+
+ ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ if (ret < 0)
+ return ret;
+
+ fuse_invalidate_attr(inode);
+ fuse_write_update_size(inode, iocb->ki_pos);
+ return ret;
+}
+
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ goto out;
+ /* TODO file_update_time() but we don't want metadata I/O */
+
+ /* Do not use dax for file-extending writes, as the write and the
+ * on-disk i_size increase are otherwise not atomic.
+ */
+ if (file_extending_write(iocb, from))
+ ret = fuse_dax_direct_write(iocb, from);
+ else
+ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+
+out:
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
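Per the comment above, DAX is skipped for size-extending writes because the data write and the on-disk i_size update would not be atomic. A sketch of the predicate with assumed sizes:

#include <stdbool.h>
#include <stdio.h>

static bool extends(unsigned long long pos, unsigned long long count,
		    unsigned long long i_size)
{
	return pos >= i_size || pos + count > i_size;
}

int main(void)
{
	/* 8 KiB file: in-place overwrite vs. extending write */
	printf("%d\n", extends(0, 4096, 8192));    /* 0: plain DAX write */
	printf("%d\n", extends(4096, 8192, 8192)); /* 1: falls back      */
	return 0;
}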
+
+static int fuse_dax_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
+}
+
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size, bool write)
+{
+ vm_fault_t ret;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ pfn_t pfn;
+ int error = 0;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ bool retry = false;
+
+ if (write)
+ sb_start_pagefault(sb);
+retry:
+ if (retry && !(fcd->nr_free_ranges > 0))
+ wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
+
+ /*
+ * We need to serialize against not only truncate but also against
+ * fuse dax memory range reclaim. While a range is being reclaimed,
+ * we do not want any read/write/mmap to make progress and try
+ * to populate page cache or access memory we are trying to free.
+ */
+ down_read(&get_fuse_inode(inode)->i_mmap_sem);
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+ if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
+ error = 0;
+ retry = true;
+ up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ goto retry;
+ }
+
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ up_read(&get_fuse_inode(inode)->i_mmap_sem);
+
+ if (write)
+ sb_end_pagefault(sb);
+
+ return ret;
+}
+
+static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE,
+ vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static const struct vm_operations_struct fuse_dax_vm_ops = {
+ .fault = fuse_dax_fault,
+ .huge_fault = fuse_dax_huge_fault,
+ .page_mkwrite = fuse_dax_page_mkwrite,
+ .pfn_mkwrite = fuse_dax_pfn_mkwrite,
+};
+
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &fuse_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ return 0;
+}
+
+static int dmap_writeback_invalidate(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT;
+ loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1);
+
+ ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos);
+ if (ret) {
+ pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n",
+ ret, start_pos, end_pos);
+ return ret;
+ }
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ start_pos >> PAGE_SHIFT,
+ end_pos >> PAGE_SHIFT);
+ if (ret)
+ pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
+ ret);
+
+ return ret;
+}
+
+static int reclaim_one_dmap_locked(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * igrab() was done to make sure the inode won't go away under us, and
+ * this further avoids the race with evict().
+ */
+ ret = dmap_writeback_invalidate(inode, dmap);
+ if (ret)
+ return ret;
+
+ /* Remove dax mapping from inode interval tree now */
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr--;
+
+ /* It is possible that umount/shutdown has killed the fuse connection
+ * and the worker thread is trying to reclaim memory in parallel. Don't
+ * warn in that case.
+ */
+ ret = dmap_removemapping_one(inode, dmap);
+ if (ret && ret != -ENOTCONN) {
+ pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
+ dmap->window_offset, dmap->length, ret);
+ }
+ return 0;
+}
+
+/* Find the first mapped dmap for an inode and return it. Caller needs
+ * to hold fi->dax->sem lock either shared or exclusive.
+ */
+static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node;
+ node = interval_tree_iter_next(node, 0, -1)) {
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ continue;
+
+ return dmap;
+ }
+
+ return NULL;
+}
+
+/*
+ * Find first mapping in the tree and free it and return it. Do not add
+ * it back to free pool.
+ */
+static struct fuse_dax_mapping *
+inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
+ bool *retry)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ u64 dmap_start, dmap_end;
+ unsigned long start_idx;
+ int ret;
+ struct interval_tree_node *node;
+
+ down_write(&fi->i_mmap_sem);
+
+ /* Lookup a dmap and corresponding file offset to reclaim. */
+ down_read(&fi->dax->sem);
+ dmap = inode_lookup_first_dmap(inode);
+ if (dmap) {
+ start_idx = dmap->itn.start;
+ dmap_start = start_idx << FUSE_DAX_SHIFT;
+ dmap_end = dmap_start + FUSE_DAX_SZ - 1;
+ }
+ up_read(&fi->dax->sem);
+
+ if (!dmap)
+ goto out_mmap_sem;
+ /*
+ * Make sure there are no references to inode pages using
+ * get_user_pages()
+ */
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ dmap = ERR_PTR(ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ /* Range already got reclaimed by somebody else */
+ if (!node) {
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1) {
+ dmap = NULL;
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0) {
+ dmap = ERR_PTR(ret);
+ goto out_write_dmap_sem;
+ }
+
+ /* Clean up dmap. Do not add back to free list */
+ dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+
+ pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n",
+ __func__, inode, dmap->window_offset, dmap->length);
+
+out_write_dmap_sem:
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ up_write(&fi->i_mmap_sem);
+ return dmap;
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
+{
+ struct fuse_dax_mapping *dmap;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ while (1) {
+ bool retry = false;
+
+ dmap = alloc_dax_mapping(fcd);
+ if (dmap)
+ return dmap;
+
+ dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
+ /*
+ * Either we got a mapping or it is an error, return in both
+ * the cases.
+ */
+ if (dmap)
+ return dmap;
+
+ /* If we could not reclaim a mapping because it
+ * had a reference or some other temporary failure,
+ * try again. We want to give up inline reclaim only
+ * if there is no range assigned to this node. Otherwise
+ * a deadlock is possible if we sleep with fi->i_mmap_sem
+ * held and the worker freeing memory can't make progress
+ * because the fi->i_mmap_sem lock is unavailable. So sleep
+ * only if fi->dax->nr == 0.
+ */
+ if (retry)
+ continue;
+ /*
+ * There are no mappings which can be reclaimed. Wait for one.
+ * We are not holding fi->dax->sem. So it is possible
+ * that a range gets added now. But as we are not holding
+ * fi->i_mmap_sem, the worker should still be able to free up
+ * a range and wake us up.
+ */
+ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
+ if (wait_event_killable_exclusive(fcd->range_waitq,
+ (fcd->nr_free_ranges > 0))) {
+ return ERR_PTR(-EINTR);
+ }
+ }
+ }
+}
+
+static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ /* Find the fuse dax mapping at this file offset in the inode. */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+
+ /* Range already got cleaned up by somebody else */
+ if (!node)
+ return 0;
+ dmap = node_to_dmap(node);
+
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ return 0;
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0)
+ return ret;
+
+ /* Cleanup dmap entry and add back to free list */
+ spin_lock(&fcd->lock);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+ return ret;
+}
+
+/*
+ * Free a range of memory.
+ * Locking:
+ * 1. Take fi->i_mmap_sem to block dax faults.
+ * 2. Take fi->dax->sem to protect interval tree and also to make sure
+ * read/write can not reuse a dmap which we might be freeing.
+ */
+static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx,
+ unsigned long end_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
+ loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
+
+ down_write(&fi->i_mmap_sem);
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ up_write(&fi->i_mmap_sem);
+ return ret;
+}
+
+static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd,
+ unsigned long nr_to_free)
+{
+ struct fuse_dax_mapping *dmap, *pos, *temp;
+ int ret, nr_freed = 0;
+ unsigned long start_idx = 0, end_idx = 0;
+ struct inode *inode = NULL;
+
+ /* Pick the first busy range and free it for now */
+ while (1) {
+ if (nr_freed >= nr_to_free)
+ break;
+
+ dmap = NULL;
+ spin_lock(&fcd->lock);
+
+ if (!fcd->nr_busy_ranges) {
+ spin_unlock(&fcd->lock);
+ return 0;
+ }
+
+ list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
+ busy_list) {
+ /* skip this range if it's in use. */
+ if (refcount_read(&pos->refcnt) > 1)
+ continue;
+
+ inode = igrab(pos->inode);
+ /*
+ * This inode is going away. That will free
+ * up all the ranges anyway, continue to
+ * next range.
+ */
+ if (!inode)
+ continue;
+ /*
+ * Take this element off the list and add it to the tail. If
+ * this element can't be freed, that will help with
+ * selecting a new element in the next iteration of the loop.
+ */
+ dmap = pos;
+ list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
+ start_idx = end_idx = dmap->itn.start;
+ break;
+ }
+ spin_unlock(&fcd->lock);
+ if (!dmap)
+ return 0;
+
+ ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
+ iput(inode);
+ if (ret)
+ return ret;
+ nr_freed++;
+ }
+ return 0;
+}
+
+static void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+ int ret;
+ struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
+ free_work.work);
+ ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
+ if (ret) {
+ pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+ ret);
+ }
+
+ /* If the number of free ranges is still below the threshold, requeue */
+ kick_dmap_free_worker(fcd, 1);
+}
+
+static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
+{
+ struct fuse_dax_mapping *range, *temp;
+
+ /* Free All allocated elements */
+ list_for_each_entry_safe(range, temp, mem_list, list) {
+ list_del(&range->list);
+ if (!list_empty(&range->busy_list))
+ list_del(&range->busy_list);
+ kfree(range);
+ }
+}
+
+void fuse_dax_conn_free(struct fuse_conn *fc)
+{
+ if (fc->dax) {
+ fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
+ kfree(fc->dax);
+ }
+}
+
+static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
+{
+ long nr_pages, nr_ranges;
+ void *kaddr;
+ pfn_t pfn;
+ struct fuse_dax_mapping *range;
+ int ret, id;
+ size_t dax_size = -1;
+ unsigned long i;
+
+ init_waitqueue_head(&fcd->range_waitq);
+ INIT_LIST_HEAD(&fcd->free_ranges);
+ INIT_LIST_HEAD(&fcd->busy_ranges);
+ INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
+
+ id = dax_read_lock();
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
+ &pfn);
+ dax_read_unlock(id);
+ if (nr_pages < 0) {
+ pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+ return nr_pages;
+ }
+
+ nr_ranges = nr_pages/FUSE_DAX_PAGES;
+ pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
+ __func__, nr_pages, nr_ranges);
+
+ for (i = 0; i < nr_ranges; i++) {
+ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!range)
+ goto out_err;
+
+ /* TODO: This offset only works if the virtio-fs driver is not
+ * keeping some memory hidden at the beginning of the window.
+ * This needs better handling.
+ */
+ range->window_offset = i * FUSE_DAX_SZ;
+ range->length = FUSE_DAX_SZ;
+ INIT_LIST_HEAD(&range->busy_list);
+ refcount_set(&range->refcnt, 1);
+ list_add_tail(&range->list, &fcd->free_ranges);
+ }
+
+ fcd->nr_free_ranges = nr_ranges;
+ fcd->nr_ranges = nr_ranges;
+ return 0;
+out_err:
+ /* Free all allocated elements */
+ fuse_free_dax_mem_ranges(&fcd->free_ranges);
+ return ret;
+}
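
The setup above is plain arithmetic: dax_direct_access() reports how many pages the window has, and the window is carved into fixed-size ranges at offsets i * FUSE_DAX_SZ. A standalone sketch of that arithmetic, assuming the 2 MiB range size (FUSE_DAX_SHIFT = 21) this patch series uses; treat the exact constants as an assumption here:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define FUSE_DAX_SHIFT	21
#define FUSE_DAX_SZ	(1UL << FUSE_DAX_SHIFT)		/* 2 MiB per range */
#define FUSE_DAX_PAGES	(FUSE_DAX_SZ / PAGE_SIZE)	/* 512 pages per range */

int main(void)
{
	unsigned long nr_pages = 16384;			/* e.g. a 64 MiB cache window */
	unsigned long nr_ranges = nr_pages / FUSE_DAX_PAGES;

	printf("nr_ranges=%lu\n", nr_ranges);		/* 32 */
	for (unsigned long i = 0; i < nr_ranges; i++)
		printf("range %2lu: window_offset=0x%08lx length=0x%lx\n",
		       i, i * FUSE_DAX_SZ, FUSE_DAX_SZ);
	return 0;
}
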
+
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
+{
+ struct fuse_conn_dax *fcd;
+ int err;
+
+ if (!dax_dev)
+ return 0;
+
+ fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
+ if (!fcd)
+ return -ENOMEM;
+
+ spin_lock_init(&fcd->lock);
+ fcd->dev = dax_dev;
+ err = fuse_dax_mem_range_init(fcd);
+ if (err) {
+ kfree(fcd);
+ return err;
+ }
+
+ fc->dax = fcd;
+ return 0;
+}
+
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ fi->dax = NULL;
+ if (fc->dax) {
+ fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
+ if (!fi->dax)
+ return false;
+
+ init_rwsem(&fi->dax->sem);
+ fi->dax->tree = RB_ROOT_CACHED;
+ }
+
+ return true;
+}
+
+static const struct address_space_operations fuse_dax_file_aops = {
+ .writepages = fuse_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .set_page_dirty = noop_set_page_dirty,
+ .invalidatepage = noop_invalidatepage,
+};
+
+void fuse_dax_inode_init(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->dax)
+ return;
+
+ inode->i_flags |= S_DAX;
+ inode->i_data.a_ops = &fuse_dax_file_aops;
+}
+
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+ if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
+ pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+ map_alignment, FUSE_DAX_SZ);
+ return false;
+ }
+ return true;
+}
+
+void fuse_dax_cancel_work(struct fuse_conn *fc)
+{
+ struct fuse_conn_dax *fcd = fc->dax;
+
+ if (fcd)
+ cancel_delayed_work_sync(&fcd->free_work);
+}
+EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 02b3c36b3676..588f8d1240aa 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
return READ_ONCE(file->private_data);
}
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
refcount_set(&req->count, 1);
__set_bit(FR_PENDING, &req->flags);
+ req->fm = fm;
}
-static struct fuse_req *fuse_request_alloc(gfp_t flags)
+static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
{
struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req)
- fuse_request_init(req);
+ fuse_request_init(fm, req);
return req;
}
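
The refactoring in this hunk threads struct fuse_mount through every request by stashing a back-pointer at allocation time, so later helpers can take just the request. Reduced to a userspace sketch with invented names:

#include <stdio.h>
#include <stdlib.h>

struct mount { int id; };		/* ~ struct fuse_mount */

struct req {
	struct mount *fm;		/* back-pointer, set once at alloc */
};

static struct req *req_alloc(struct mount *fm)
{
	struct req *r = calloc(1, sizeof(*r));
	if (r)
		r->fm = fm;		/* later code needs only the req */
	return r;
}

static void req_end(struct req *r)	/* note: no mount argument needed */
{
	printf("ending request on mount %d\n", r->fm->id);
	free(r);
}

int main(void)
{
	struct mount m = { .id = 1 };
	struct req *r = req_alloc(&m);

	if (r)
		req_end(r);
	return 0;
}
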
@@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
}
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+static void fuse_put_request(struct fuse_req *req);
-static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
+static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
int err;
atomic_inc(&fc->num_waiting);
@@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
if (fc->conn_error)
goto out;
- req = fuse_request_alloc(GFP_KERNEL);
+ req = fuse_request_alloc(fm, GFP_KERNEL);
err = -ENOMEM;
if (!req) {
if (for_background)
@@ -143,7 +145,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
if (unlikely(req->in.h.uid == ((uid_t)-1) ||
req->in.h.gid == ((gid_t)-1))) {
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ERR_PTR(-EOVERFLOW);
}
return req;
@@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
return ERR_PTR(err);
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_put_request(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
if (refcount_dec_and_test(&req->count)) {
if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
@@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
* the 'end' callback is called if given, else the reference to the
* request is released
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
if (test_and_set_bit(FR_FINISHED, &req->flags))
@@ -309,9 +315,9 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fm->sb) {
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
@@ -323,14 +329,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
}
if (test_bit(FR_ASYNC, &req->flags))
- req->args->end(fc, req->args, req->out.h.error);
+ req->args->end(fm, req->args, req->out.h.error);
put_request:
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
EXPORT_SYMBOL_GPL(fuse_request_end);
-static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_req *req)
{
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
+
spin_lock(&fiq->lock);
/* Check whether we've already sent a request to interrupt this req */
if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
@@ -357,8 +365,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
return 0;
}
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+static void request_wait_answer(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
int err;
@@ -373,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
/* matches barrier in fuse_dev_do_read() */
smp_mb__after_atomic();
if (test_bit(FR_SENT, &req->flags))
- queue_interrupt(fiq, req);
+ queue_interrupt(req);
}
if (!test_bit(FR_FORCE, &req->flags)) {
@@ -402,9 +411,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
}
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_req *req)
{
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
spin_lock(&fiq->lock);
@@ -418,7 +427,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
__fuse_get_request(req);
queue_request_and_unlock(fiq, req);
- request_wait_answer(fc, req);
+ request_wait_answer(req);
/* Pairs with smp_wmb() in fuse_request_end() */
smp_rmb();
}
@@ -457,8 +466,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
}
}
-static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_force_creds(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
@@ -473,23 +484,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
__set_bit(FR_ASYNC, &req->flags);
}
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
ssize_t ret;
if (args->force) {
atomic_inc(&fc->num_waiting);
- req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
+ req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL);
if (!args->nocreds)
- fuse_force_creds(fc, req);
+ fuse_force_creds(req);
__set_bit(FR_WAITING, &req->flags);
__set_bit(FR_FORCE, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
}
@@ -500,20 +512,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
if (!args->noreply)
__set_bit(FR_ISREPLY, &req->flags);
- __fuse_request_send(fc, req);
+ __fuse_request_send(req);
ret = req->out.h.error;
if (!ret && args->out_argvar) {
BUG_ON(args->out_numargs == 0);
ret = args->out_args[args->out_numargs - 1].size;
}
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ret;
}
-static bool fuse_request_queue_background(struct fuse_conn *fc,
- struct fuse_req *req)
+static bool fuse_request_queue_background(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
bool queued = false;
WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
@@ -527,9 +540,9 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fm->sb) {
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
@@ -540,28 +553,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
return queued;
}
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags)
{
struct fuse_req *req;
if (args->force) {
WARN_ON(!args->nocreds);
- req = fuse_request_alloc(gfp_flags);
+ req = fuse_request_alloc(fm, gfp_flags);
if (!req)
return -ENOMEM;
__set_bit(FR_BACKGROUND, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, true);
+ req = fuse_get_req(fm, true);
if (IS_ERR(req))
return PTR_ERR(req);
}
fuse_args_to_req(req, args);
- if (!fuse_request_queue_background(fc, req)) {
- fuse_put_request(fc, req);
+ if (!fuse_request_queue_background(req)) {
+ fuse_put_request(req);
return -ENOTCONN;
}
@@ -569,14 +582,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
}
EXPORT_SYMBOL_GPL(fuse_simple_background);
-static int fuse_simple_notify_reply(struct fuse_conn *fc,
+static int fuse_simple_notify_reply(struct fuse_mount *fm,
struct fuse_args *args, u64 unique)
{
struct fuse_req *req;
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &fm->fc->iq;
int err = 0;
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -591,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc,
} else {
err = -ENODEV;
spin_unlock(&fiq->lock);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
return err;
@@ -785,15 +798,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
struct page *newpage;
struct pipe_buffer *buf = cs->pipebufs;
+ get_page(oldpage);
err = unlock_request(cs->req);
if (err)
- return err;
+ goto out_put_old;
fuse_copy_finish(cs);
err = pipe_buf_confirm(cs->pipe, buf);
if (err)
- return err;
+ goto out_put_old;
BUG_ON(!cs->nr_segs);
cs->currbuf = buf;
@@ -833,7 +847,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
if (err) {
unlock_page(newpage);
- return err;
+ goto out_put_old;
}
get_page(newpage);
@@ -852,14 +866,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (err) {
unlock_page(newpage);
put_page(newpage);
- return err;
+ goto out_put_old;
}
unlock_page(oldpage);
+ /* Drop ref for ap->pages[] array */
put_page(oldpage);
cs->len = 0;
- return 0;
+ err = 0;
+out_put_old:
+ /* Drop ref obtained in this function */
+ put_page(oldpage);
+ return err;
out_fallback_unlock:
unlock_page(newpage);
@@ -868,10 +887,10 @@ out_fallback:
cs->offset = buf->offset;
err = lock_request(cs->req);
- if (err)
- return err;
+ if (!err)
+ err = 1;
- return 1;
+ goto out_put_old;
}
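
The fix in this hunk pins oldpage with get_page() before the first call that can drop the request lock, then funnels every exit through a single out_put_old label. The general "pin early, one put on every path" shape, as a self-contained sketch with invented names:

#include <stdio.h>

struct obj { int refs; };

static void get_obj(struct obj *o) { o->refs++; }
static void put_obj(struct obj *o) { o->refs--; }
static int step_may_drop_lock(void) { return 0; }
static int step_that_fails(void) { return -1; }	/* pretend failure */

static int use_obj(struct obj *o)
{
	int err;

	get_obj(o);			/* pin before anything can drop it */
	err = step_may_drop_lock();
	if (err)
		goto out_put;
	err = step_that_fails();
	if (err)
		goto out_put;
	err = 0;			/* success path reaches the same put */
out_put:
	put_obj(o);			/* exactly one put on every path */
	return err;
}

int main(void)
{
	struct obj o = { .refs = 1 };

	printf("err=%d refs=%d\n", use_obj(&o), o.refs);	/* refs back to 1 */
	return 0;
}
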
static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
@@ -883,14 +902,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
+ get_page(page);
err = unlock_request(cs->req);
- if (err)
+ if (err) {
+ put_page(page);
return err;
+ }
fuse_copy_finish(cs);
buf = cs->pipebufs;
- get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = count;
@@ -1250,7 +1271,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
/* SETXATTR is special, since it may contain too large data */
if (args->opcode == FUSE_SETXATTR)
req->out.h.error = -E2BIG;
- fuse_request_end(fc, req);
+ fuse_request_end(req);
goto restart;
}
spin_lock(&fpq->lock);
@@ -1284,8 +1305,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
/* matches barrier in request_wait_answer() */
smp_mb__after_atomic();
if (test_bit(FR_INTERRUPTED, &req->flags))
- queue_interrupt(fiq, req);
- fuse_put_request(fc, req);
+ queue_interrupt(req);
+ fuse_put_request(req);
return reqsize;
@@ -1293,7 +1314,7 @@ out_end:
if (!test_bit(FR_PRIVATE, &req->flags))
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
return err;
err_unlock:
@@ -1416,11 +1437,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
fuse_copy_finish(cs);
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb) {
- err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
- outarg.off, outarg.len);
- }
+ err = fuse_reverse_inval_inode(fc, outarg.ino,
+ outarg.off, outarg.len);
up_read(&fc->killsb);
return err;
@@ -1466,9 +1484,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1516,10 +1532,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
- outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1561,10 +1574,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
- if (!fc->sb)
- goto out_up_killsb;
-
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
goto out_up_killsb;
@@ -1621,7 +1631,7 @@ struct fuse_retrieve_args {
struct fuse_notify_retrieve_in inarg;
};
-static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_retrieve_args *ra =
@@ -1631,7 +1641,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
kfree(ra);
}
-static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
struct fuse_notify_retrieve_out *outarg)
{
int err;
@@ -1642,6 +1652,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
unsigned int offset;
size_t total_len = 0;
unsigned int num_pages;
+ struct fuse_conn *fc = fm->fc;
struct fuse_retrieve_args *ra;
size_t args_size = sizeof(*ra);
struct fuse_args_pages *ap;
@@ -1703,9 +1714,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
args->in_args[0].value = &ra->inarg;
args->in_args[1].size = total_len;
- err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
+ err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
if (err)
- fuse_retrieve_end(fc, args, err);
+ fuse_retrieve_end(fm, args, err);
return err;
}
@@ -1714,7 +1725,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_retrieve_out outarg;
+ struct fuse_mount *fm;
struct inode *inode;
+ u64 nodeid;
int err;
err = -EINVAL;
@@ -1729,14 +1742,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
- if (fc->sb) {
- u64 nodeid = outarg.nodeid;
+ nodeid = outarg.nodeid;
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
- if (inode) {
- err = fuse_retrieve(fc, inode, &outarg);
- iput(inode);
- }
+ inode = fuse_ilookup(fc, nodeid, &fm);
+ if (inode) {
+ err = fuse_retrieve(fm, inode, &outarg);
+ iput(inode);
}
up_read(&fc->killsb);
@@ -1875,9 +1886,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
else if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- err = queue_interrupt(&fc->iq, req);
+ err = queue_interrupt(req);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
goto copy_finish;
}
@@ -1907,7 +1918,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
out:
return err ? err : nbytes;
@@ -2045,7 +2056,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
}
/* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct fuse_conn *fc, struct list_head *head)
+static void end_requests(struct list_head *head)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -2053,7 +2064,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
req->out.h.error = -ECONNABORTED;
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
@@ -2148,7 +2159,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
} else {
spin_unlock(&fc->lock);
}
@@ -2178,7 +2189,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 26f028bc760b..ff7dbeb16f88 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/fs_context.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -196,7 +197,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
{
struct inode *inode;
struct dentry *parent;
- struct fuse_conn *fc;
+ struct fuse_mount *fm;
struct fuse_inode *fi;
int ret;
@@ -218,27 +219,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (flags & LOOKUP_RCU)
goto out;
- fc = get_fuse_conn(inode);
+ fm = get_fuse_mount(inode);
forget = fuse_alloc_forget();
ret = -ENOMEM;
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
parent = dget_parent(entry);
- fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
+ fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
&entry->d_name, &outarg);
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
dput(parent);
/* Zero nodeid is same as -ENOENT */
if (!ret && !outarg.nodeid)
ret = -ENOENT;
if (!ret) {
fi = get_fuse_inode(inode);
- if (outarg.nodeid != get_node_id(inode)) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ if (outarg.nodeid != get_node_id(inode) ||
+ (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) {
+ fuse_queue_forget(fm->fc, forget,
+ outarg.nodeid, 1);
goto invalid;
}
spin_lock(&fi->lock);
@@ -298,6 +301,79 @@ static int fuse_dentry_delete(const struct dentry *dentry)
return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
}
+/*
+ * Create a fuse_mount object with a new superblock (with path->dentry
+ * as the root), and return that mount so it can be auto-mounted on
+ * @path.
+ */
+static struct vfsmount *fuse_dentry_automount(struct path *path)
+{
+ struct fs_context *fsc;
+ struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb);
+ struct fuse_conn *fc = parent_fm->fc;
+ struct fuse_mount *fm;
+ struct vfsmount *mnt;
+ struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry));
+ struct super_block *sb;
+ int err;
+
+ fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+ if (IS_ERR(fsc)) {
+ err = PTR_ERR(fsc);
+ goto out;
+ }
+
+ err = -ENOMEM;
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ goto out_put_fsc;
+
+ refcount_set(&fm->count, 1);
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, NULL, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ err = PTR_ERR(sb);
+ fuse_mount_put(fm);
+ goto out_put_fsc;
+ }
+ fm->fc = fuse_conn_get(fc);
+
+ /* Initialize superblock, making @mp_fi its root */
+ err = fuse_fill_super_submount(sb, mp_fi);
+ if (err)
+ goto out_put_sb;
+
+ sb->s_flags |= SB_ACTIVE;
+ fsc->root = dget(sb->s_root);
+ /* We are done configuring the superblock, so unlock it */
+ up_write(&sb->s_umount);
+
+ down_write(&fc->killsb);
+ list_add_tail(&fm->fc_entry, &fc->mounts);
+ up_write(&fc->killsb);
+
+ /* Create the submount */
+ mnt = vfs_create_mount(fsc);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
+ goto out_put_fsc;
+ }
+ mntget(mnt);
+ put_fs_context(fsc);
+ return mnt;
+
+out_put_sb:
+ /*
+ * Only jump here when fsc->root is NULL and sb is still locked
+ * (otherwise put_fs_context() will put the superblock)
+ */
+ deactivate_locked_super(sb);
+out_put_fsc:
+ put_fs_context(fsc);
+out:
+ return ERR_PTR(err);
+}
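
The error unwinding above is ordering-sensitive: once fsc->root is set, put_fs_context() also puts the superblock, so the out_put_sb label is only reachable while fsc->root is still NULL. The underlying staged-unwind idiom, as a self-contained userspace sketch (invented names; frees stand in for the real teardown calls):

#include <stdio.h>
#include <stdlib.h>

static int configure(void *a, void *b) { (void)a; (void)b; return -1; } /* pretend failure */

static int setup(void)
{
	void *a, *b;
	int err = -1;

	a = malloc(16);			/* stage 1 */
	if (!a)
		return -1;

	b = malloc(16);			/* stage 2 */
	if (!b)
		goto out_free_a;

	err = configure(a, b);		/* stage 3 */
	if (err)
		goto out_free_b;	/* undo the stages in reverse order */

	/* success: in the real code ownership is handed off at this point */
	free(b);
	free(a);
	return 0;

out_free_b:
	free(b);
out_free_a:
	free(a);
	return err;
}

int main(void) { printf("setup -> %d\n", setup()); return 0; }
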
+
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
.d_delete = fuse_dentry_delete,
@@ -305,6 +381,7 @@ const struct dentry_operations fuse_dentry_operations = {
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
#endif
+ .d_automount = fuse_dentry_automount,
};
const struct dentry_operations fuse_root_dentry_operations = {
@@ -329,7 +406,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
u64 attr_version;
@@ -346,10 +423,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
- fuse_lookup_init(fc, &args, nodeid, name, outarg);
- err = fuse_simple_request(fc, &args);
+ fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
+ err = fuse_simple_request(fm, &args);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
if (err || !outarg->nodeid)
goto out_put_forget;
@@ -365,7 +442,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
attr_version);
err = -ENOMEM;
if (!*inode) {
- fuse_queue_forget(fc, forget, outarg->nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
goto out;
}
err = 0;
@@ -434,7 +511,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
{
int err;
struct inode *inode;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
struct fuse_create_in inarg;
@@ -452,11 +529,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
goto out_err;
err = -ENOMEM;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
goto out_put_forget_req;
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
flags &= ~O_NOCTTY;
@@ -477,7 +554,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_args[0].value = &outentry;
args.out_args[1].size = sizeof(outopen);
args.out_args[1].value = &outopen;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err)
goto out_free_ff;
@@ -494,7 +571,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(NULL, ff, flags);
- fuse_queue_forget(fc, forget, outentry.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
err = -ENOMEM;
goto out_err;
}
@@ -567,7 +644,7 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
+static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
struct inode *dir, struct dentry *entry,
umode_t mode)
{
@@ -586,7 +663,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
args->out_numargs = 1;
args->out_args[0].size = sizeof(outarg);
args->out_args[0].value = &outarg;
- err = fuse_simple_request(fc, args);
+ err = fuse_simple_request(fm, args);
if (err)
goto out_put_forget_req;
@@ -600,7 +677,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr, entry_attr_timeout(&outarg), 0);
if (!inode) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
return -ENOMEM;
}
kfree(forget);
@@ -628,10 +705,10 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
dev_t rdev)
{
struct fuse_mknod_in inarg;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
@@ -644,7 +721,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, mode);
+ return create_new_entry(fm, &args, dir, entry, mode);
}
static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
@@ -656,10 +733,10 @@ static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
@@ -671,13 +748,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, S_IFDIR);
+ return create_new_entry(fm, &args, dir, entry, S_IFDIR);
}
static int fuse_symlink(struct inode *dir, struct dentry *entry,
const char *link)
{
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
unsigned len = strlen(link) + 1;
FUSE_ARGS(args);
@@ -687,7 +764,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
args.in_args[0].value = entry->d_name.name;
args.in_args[1].size = len;
args.in_args[1].value = link;
- return create_new_entry(fc, &args, dir, entry, S_IFLNK);
+ return create_new_entry(fm, &args, dir, entry, S_IFLNK);
}
void fuse_update_ctime(struct inode *inode)
@@ -701,7 +778,7 @@ void fuse_update_ctime(struct inode *inode)
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
args.opcode = FUSE_UNLINK;
@@ -709,13 +786,13 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
struct inode *inode = d_inode(entry);
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
/*
* If i_nlink == 0 then unlink doesn't make sense, yet this can
* happen if userspace filesystem is careless. It would be
@@ -737,7 +814,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
args.opcode = FUSE_RMDIR;
@@ -745,7 +822,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
clear_nlink(d_inode(entry));
fuse_dir_changed(dir);
@@ -761,7 +838,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
{
int err;
struct fuse_rename2_in inarg;
- struct fuse_conn *fc = get_fuse_conn(olddir);
+ struct fuse_mount *fm = get_fuse_mount(olddir);
FUSE_ARGS(args);
memset(&inarg, 0, argsize);
@@ -776,7 +853,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
args.in_args[1].value = oldent->d_name.name;
args.in_args[2].size = newent->d_name.len + 1;
args.in_args[2].value = newent->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
/* ctime changes */
fuse_invalidate_attr(d_inode(oldent));
@@ -847,7 +924,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
int err;
struct fuse_link_in inarg;
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
@@ -858,7 +935,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
+ err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
/* Contrary to "normal" filesystems it can happen that link
makes two "logical" inodes point to the same "physical"
inode. We invalidate the attributes of the old one, so it
@@ -869,7 +946,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
if (likely(inode->i_nlink < UINT_MAX))
inc_nlink(inode);
spin_unlock(&fi->lock);
@@ -926,11 +1003,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
int err;
struct fuse_getattr_in inarg;
struct fuse_attr_out outarg;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
u64 attr_version;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
@@ -949,7 +1026,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
if (fuse_invalid_attr(&outarg.attr) ||
(inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
@@ -1002,7 +1079,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file)
STATX_BASIC_STATS & ~STATX_ATIME, 0);
}
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name)
{
int err = -ENOTDIR;
@@ -1010,7 +1087,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
struct dentry *dir;
struct dentry *entry;
- parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+ parent = fuse_ilookup(fc, parent_nodeid, NULL);
if (!parent)
return -ENOENT;
@@ -1102,14 +1179,14 @@ int fuse_allow_current_process(struct fuse_conn *fc)
static int fuse_access(struct inode *inode, int mask)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_access_in inarg;
int err;
BUG_ON(mask & MAY_NOT_BLOCK);
- if (fc->no_access)
+ if (fm->fc->no_access)
return 0;
memset(&inarg, 0, sizeof(inarg));
@@ -1119,9 +1196,9 @@ static int fuse_access(struct inode *inode, int mask)
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_access = 1;
+ fm->fc->no_access = 1;
err = 0;
}
return err;
@@ -1209,7 +1286,7 @@ static int fuse_permission(struct inode *inode, int mask)
static int fuse_readlink_page(struct inode *inode, struct page *page)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
struct fuse_args_pages ap = {
.num_pages = 1,
@@ -1226,7 +1303,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page)
ap.args.page_zeroing = true;
ap.args.out_numargs = 1;
ap.args.out_args[0].size = desc.length;
- res = fuse_simple_request(fc, &ap.args);
+ res = fuse_simple_request(fm, &ap.args);
fuse_invalidate_atime(inode);
@@ -1454,7 +1531,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
*/
int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1465,7 +1542,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.valid = FATTR_MTIME;
inarg.mtime = inode->i_mtime.tv_sec;
inarg.mtimensec = inode->i_mtime.tv_nsec;
- if (fc->minor >= 23) {
+ if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
inarg.ctime = inode->i_ctime.tv_sec;
inarg.ctimensec = inode->i_ctime.tv_nsec;
@@ -1474,9 +1551,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.valid |= FATTR_FH;
inarg.fh = ff->fh;
}
- fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+ fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg);
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
/*
@@ -1491,7 +1568,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct file *file)
{
struct inode *inode = d_inode(dentry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
struct fuse_inode *fi = get_fuse_inode(inode);
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
@@ -1501,6 +1579,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
loff_t oldsize;
int err;
bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+ bool fault_blocked = false;
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
@@ -1509,6 +1588,22 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (err)
return err;
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return -EIO;
+ is_truncate = true;
+ }
+
+ if (FUSE_IS_DAX(inode) && is_truncate) {
+ down_write(&fi->i_mmap_sem);
+ fault_blocked = true;
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err) {
+ up_write(&fi->i_mmap_sem);
+ return err;
+ }
+ }
+
if (attr->ia_valid & ATTR_OPEN) {
/* This is coming from open(..., ... | O_TRUNC); */
WARN_ON(!(attr->ia_valid & ATTR_SIZE));
@@ -1521,17 +1616,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
*/
i_size_write(inode, 0);
truncate_pagecache(inode, 0);
- return 0;
+ goto out;
}
file = NULL;
}
- if (attr->ia_valid & ATTR_SIZE) {
- if (WARN_ON(!S_ISREG(inode->i_mode)))
- return -EIO;
- is_truncate = true;
- }
-
/* Flush dirty data/metadata before non-truncate SETATTR */
if (is_wb && S_ISREG(inode->i_mode) &&
attr->ia_valid &
@@ -1566,7 +1655,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
}
fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -EINTR)
fuse_invalidate_attr(inode);
@@ -1614,6 +1703,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+out:
+ if (fault_blocked)
+ up_write(&fi->i_mmap_sem);
+
return 0;
error:
@@ -1621,6 +1714,9 @@ error:
fuse_release_nowrite(inode);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ if (fault_blocked)
+ up_write(&fi->i_mmap_sem);
return err;
}
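
The hunk above serializes truncation against the DAX fault path: the truncating task takes fi->i_mmap_sem for write and waits out existing mappings via fuse_dax_break_layouts() before changing the size, while faults take the semaphore shared. A userspace model of that rwsem discipline (pthread stand-ins, invented names):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static int break_layouts(void) { return 0; }	/* ~ fuse_dax_break_layouts() */

static int do_truncate(void)
{
	int err, fault_blocked = 0;

	pthread_rwlock_wrlock(&i_mmap_sem);	/* writer: block new faults */
	fault_blocked = 1;
	err = break_layouts();			/* wait out existing users */
	if (err)
		goto out;

	/* ... the actual SETATTR request would go here ... */
	err = 0;
out:
	if (fault_blocked)
		pthread_rwlock_unlock(&i_mmap_sem);
	return err;
}

static void *fault_path(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&i_mmap_sem);	/* faults take it shared */
	/* ... install the mapping ... */
	pthread_rwlock_unlock(&i_mmap_sem);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, fault_path, NULL);
	printf("truncate -> %d\n", do_truncate());
	pthread_join(t, NULL);
	return 0;
}
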
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6611ef3269a8..c03034e8c152 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -32,7 +32,7 @@ static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
return pages;
}
-static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
struct fuse_open_in inarg;
@@ -40,7 +40,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
memset(&inarg, 0, sizeof(inarg));
inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
- if (!fc->atomic_o_trunc)
+ if (!fm->fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
args.opcode = opcode;
args.nodeid = nodeid;
@@ -51,7 +51,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
args.out_args[0].size = sizeof(*outargp);
args.out_args[0].value = outargp;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
struct fuse_release_args {
@@ -60,7 +60,7 @@ struct fuse_release_args {
struct inode *inode;
};
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
{
struct fuse_file *ff;
@@ -68,7 +68,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
if (unlikely(!ff))
return NULL;
- ff->fc = fc;
+ ff->fm = fm;
ff->release_args = kzalloc(sizeof(*ff->release_args),
GFP_KERNEL_ACCOUNT);
if (!ff->release_args) {
@@ -82,7 +82,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
- ff->kh = atomic64_inc_return(&fc->khctr);
+ ff->kh = atomic64_inc_return(&fm->fc->khctr);
return ff;
}
@@ -100,7 +100,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
return ff;
}
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
@@ -114,29 +114,30 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
if (refcount_dec_and_test(&ff->count)) {
struct fuse_args *args = &ff->release_args->args;
- if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
+ if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
/* Do nothing when client does not implement 'open' */
- fuse_release_end(ff->fc, args, 0);
+ fuse_release_end(ff->fm, args, 0);
} else if (sync) {
- fuse_simple_request(ff->fc, args);
- fuse_release_end(ff->fc, args, 0);
+ fuse_simple_request(ff->fm, args);
+ fuse_release_end(ff->fm, args, 0);
} else {
args->end = fuse_release_end;
- if (fuse_simple_background(ff->fc, args,
+ if (fuse_simple_background(ff->fm, args,
GFP_KERNEL | __GFP_NOFAIL))
- fuse_release_end(ff->fc, args, -ENOTCONN);
+ fuse_release_end(ff->fm, args, -ENOTCONN);
}
kfree(ff);
}
}
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
return -ENOMEM;
@@ -147,7 +148,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
struct fuse_open_out outarg;
int err;
- err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+ err = fuse_send_open(fm, nodeid, file, opcode, &outarg);
if (!err) {
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
@@ -216,27 +217,40 @@ void fuse_finish_open(struct inode *inode, struct file *file)
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
int err;
bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
+ bool dax_truncate = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc && FUSE_IS_DAX(inode);
err = generic_file_open(inode, file);
if (err)
return err;
- if (is_wb_truncate) {
+ if (is_wb_truncate || dax_truncate) {
inode_lock(inode);
fuse_set_nowrite(inode);
}
- err = fuse_do_open(fc, get_node_id(inode), file, isdir);
+ if (dax_truncate) {
+ down_write(&get_fuse_inode(inode)->i_mmap_sem);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+ err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (is_wb_truncate) {
+out:
+ if (dax_truncate)
+ up_write(&get_fuse_inode(inode)->i_mmap_sem);
+
+ if (is_wb_truncate | dax_truncate) {
fuse_release_nowrite(inode);
inode_unlock(inode);
}
@@ -247,7 +261,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
int flags, int opcode)
{
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
struct fuse_release_args *ra = ff->release_args;
/* Inode is NULL on error path of fuse_create_open() */
@@ -285,7 +299,7 @@ void fuse_release_common(struct file *file, bool isdir)
if (ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
- ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
+ ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc,
(fl_owner_t) file);
}
/* Hold inode until release is finished */
@@ -300,7 +314,7 @@ void fuse_release_common(struct file *file, bool isdir)
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fc->destroy, isdir);
+ fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}
static int fuse_open(struct inode *inode, struct file *file)
@@ -443,7 +457,7 @@ static void fuse_sync_writes(struct inode *inode)
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
struct fuse_flush_in inarg;
FUSE_ARGS(args);
@@ -465,12 +479,12 @@ static int fuse_flush(struct file *file, fl_owner_t id)
return err;
err = 0;
- if (fc->no_flush)
+ if (fm->fc->no_flush)
goto inval_attr_out;
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
- inarg.lock_owner = fuse_lock_owner_id(fc, id);
+ inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
args.opcode = FUSE_FLUSH;
args.nodeid = get_node_id(inode);
args.in_numargs = 1;
@@ -478,9 +492,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
args.in_args[0].value = &inarg;
args.force = true;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_flush = 1;
+ fm->fc->no_flush = 1;
err = 0;
}
@@ -489,7 +503,7 @@ inval_attr_out:
* i_blocks is not maintained in memory by fuse; if writeback cache is
* enabled, i_blocks from the cached attr may not be accurate.
*/
- if (!err && fc->writeback_cache)
+ if (!err && fm->fc->writeback_cache)
fuse_invalidate_attr(inode);
return err;
}
@@ -498,7 +512,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
int datasync, int opcode)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_fsync_in inarg;
@@ -511,7 +525,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
static int fuse_fsync(struct file *file, loff_t start, loff_t end,
@@ -686,7 +700,7 @@ static void fuse_io_free(struct fuse_io_args *ia)
kfree(ia);
}
-static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
@@ -715,7 +729,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
fuse_io_free(ia);
}
-static ssize_t fuse_async_req_send(struct fuse_conn *fc,
+static ssize_t fuse_async_req_send(struct fuse_mount *fm,
struct fuse_io_args *ia, size_t num_bytes)
{
ssize_t err;
@@ -729,9 +743,9 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
ia->ap.args.end = fuse_aio_complete_req;
ia->ap.args.may_block = io->should_dirty;
- err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
+ err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
if (err)
- fuse_aio_complete_req(fc, &ia->ap.args, err);
+ fuse_aio_complete_req(fm, &ia->ap.args, err);
return num_bytes;
}
@@ -741,18 +755,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
{
struct file *file = ia->io->iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
if (owner != NULL) {
ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
- ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
+ ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- return fuse_simple_request(fc, &ia->ap.args);
+ return fuse_simple_request(fm, &ia->ap.args);
}
static void fuse_read_update_size(struct inode *inode, loff_t size,
@@ -798,7 +812,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
static int fuse_do_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
loff_t pos = page_offset(page);
struct fuse_page_desc desc = { .length = PAGE_SIZE };
struct fuse_io_args ia = {
@@ -818,14 +832,14 @@ static int fuse_do_readpage(struct file *file, struct page *page)
*/
fuse_wait_on_page_writeback(inode, page->index);
- attr_ver = fuse_get_attr_version(fc);
+ attr_ver = fuse_get_attr_version(fm->fc);
/* Don't overflow end offset */
if (pos + (desc.length - 1) == LLONG_MAX)
desc.length--;
fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
- res = fuse_simple_request(fc, &ia.ap.args);
+ res = fuse_simple_request(fm, &ia.ap.args);
if (res < 0)
return res;
/*
@@ -855,7 +869,7 @@ static int fuse_readpage(struct file *file, struct page *page)
return err;
}
-static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
int i;
@@ -899,7 +913,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = page_offset(ap->pages[0]);
size_t count = ap->num_pages << PAGE_SHIFT;
@@ -918,18 +932,18 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
WARN_ON((loff_t) (pos + count) < 0);
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
- ia->read.attr_ver = fuse_get_attr_version(fc);
- if (fc->async_read) {
+ ia->read.attr_ver = fuse_get_attr_version(fm->fc);
+ if (fm->fc->async_read) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
- err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (!err)
return;
} else {
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
err = res < 0 ? res : 0;
}
- fuse_readpages_end(fc, &ap->args, err);
+ fuse_readpages_end(fm, &ap->args, err);
}
static void fuse_readahead(struct readahead_control *rac)
@@ -1000,7 +1014,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
args->opcode = FUSE_WRITE;
args->nodeid = ff->nodeid;
args->in_numargs = 2;
- if (ff->fc->minor < 9)
+ if (ff->fm->fc->minor < 9)
args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
else
args->in_args[0].size = sizeof(ia->write.in);
@@ -1029,7 +1043,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
struct kiocb *iocb = ia->io->iocb;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_write_in *inarg = &ia->write.in;
ssize_t err;
@@ -1037,13 +1051,13 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
inarg->flags = fuse_write_flags(iocb);
if (owner != NULL) {
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
- inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+ inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- err = fuse_simple_request(fc, &ia->ap.args);
+ err = fuse_simple_request(fm, &ia->ap.args);
if (!err && ia->write.out.size > count)
err = -EIO;
@@ -1074,7 +1088,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct fuse_args_pages *ap = &ia->ap;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
unsigned int offset, i;
int err;
@@ -1084,7 +1098,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
- err = fuse_simple_request(fc, &ap->args);
+ err = fuse_simple_request(fm, &ap->args);
if (!err && ia->write.out.size > count)
err = -EIO;
@@ -1399,7 +1413,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
struct file *file = io->iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
size_t count = iov_iter_count(iter);
@@ -1539,10 +1553,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (is_bad_inode(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_read_iter(iocb, to);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_read_iter(iocb, to);
else
@@ -1553,10 +1571,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (is_bad_inode(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_write_iter(iocb, from);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_write_iter(iocb, from);
else
@@ -1578,7 +1600,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
kfree(wpa);
}
-static void fuse_writepage_finish(struct fuse_conn *fc,
+static void fuse_writepage_finish(struct fuse_mount *fm,
struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
@@ -1596,7 +1618,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
}
/* Called under fi->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc,
+static void fuse_send_writepage(struct fuse_mount *fm,
struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
@@ -1622,10 +1644,10 @@ __acquires(fi->lock)
args->force = true;
args->nocreds = true;
- err = fuse_simple_background(fc, args, GFP_ATOMIC);
+ err = fuse_simple_background(fm, args, GFP_ATOMIC);
if (err == -ENOMEM) {
spin_unlock(&fi->lock);
- err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
+ err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&fi->lock);
}
@@ -1638,7 +1660,7 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
rb_erase(&wpa->writepages_entry, &fi->writepages);
- fuse_writepage_finish(fc, wpa);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
/* After fuse_writepage_finish() aux request list is private */
@@ -1662,7 +1684,7 @@ void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
loff_t crop = i_size_read(inode);
struct fuse_writepage_args *wpa;
@@ -1671,7 +1693,7 @@ __acquires(fi->lock)
wpa = list_entry(fi->queued_writes.next,
struct fuse_writepage_args, queue_entry);
list_del_init(&wpa->queue_entry);
- fuse_send_writepage(fc, wpa, crop);
+ fuse_send_writepage(fm, wpa, crop);
}
}
@@ -1712,7 +1734,7 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
WARN_ON(fuse_insert_writeback(root, wpa));
}
-static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_writepage_args *wpa =
@@ -1724,7 +1746,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
struct fuse_writepage_args *next = wpa->next;
@@ -1756,10 +1778,10 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
* no invocations of fuse_writepage_end() while we're in
* fuse_set_nowrite..fuse_release_nowrite section.
*/
- fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+ fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
fi->writectr--;
- fuse_writepage_finish(fc, wpa);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
}
@@ -2317,6 +2339,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct fuse_file *ff = file->private_data;
+ /* DAX mmap is superior to direct_io mmap */
+ if (FUSE_IS_DAX(file_inode(file)))
+ return fuse_dax_mmap(file, vma);
+
if (ff->open_flags & FOPEN_DIRECT_IO) {
/* Can't provide the coherency needed for MAP_SHARED */
if (vma->vm_flags & VM_MAYSHARE)
@@ -2395,7 +2421,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
static int fuse_getlk(struct file *file, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
struct fuse_lk_out outarg;
@@ -2405,9 +2431,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
- err = convert_fuse_file_lock(fc, &outarg.lk, fl);
+ err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
return err;
}
@@ -2415,12 +2441,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
- pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
+ pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
@@ -2433,7 +2459,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return 0;
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
/* locking is restartable */
if (err == -EINTR)
@@ -2487,13 +2513,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_bmap_in inarg;
struct fuse_bmap_out outarg;
int err;
- if (!inode->i_sb->s_bdev || fc->no_bmap)
+ if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
return 0;
memset(&inarg, 0, sizeof(inarg));
@@ -2507,9 +2533,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS)
- fc->no_bmap = 1;
+ fm->fc->no_bmap = 1;
return err ? 0 : outarg.block;
}
@@ -2517,7 +2543,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_lseek_in inarg = {
@@ -2528,7 +2554,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
struct fuse_lseek_out outarg;
int err;
- if (fc->no_lseek)
+ if (fm->fc->no_lseek)
goto fallback;
args.opcode = FUSE_LSEEK;
@@ -2539,10 +2565,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -ENOSYS) {
- fc->no_lseek = 1;
+ fm->fc->no_lseek = 1;
goto fallback;
}
return err;
@@ -2728,7 +2754,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_ioctl_in inarg = {
.fh = ff->fh,
.cmd = cmd,
@@ -2761,12 +2787,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
+ ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!ap.pages || !iov_page)
goto out;
- fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
+ fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
/*
* If restricted, initialize IO parameters as encoded in @cmd.
@@ -2811,7 +2837,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
/* make sure there are enough buffer pages and init request with them */
err = -ENOMEM;
- if (max_pages > fc->max_pages)
+ if (max_pages > fm->fc->max_pages)
goto out;
while (ap.num_pages < max_pages) {
ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2848,7 +2874,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.out_pages = true;
ap.args.out_argvar = true;
- transferred = fuse_simple_request(fc, &ap.args);
+ transferred = fuse_simple_request(fm, &ap.args);
err = transferred;
if (transferred < 0)
goto out;
@@ -2876,7 +2902,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
goto out;
vaddr = kmap_atomic(ap.pages[0]);
- err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
+ err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
kunmap_atomic(vaddr);
@@ -2886,11 +2912,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iov = iov_page;
out_iov = in_iov + in_iovs;
- err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
+ err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
if (err)
goto out;
- err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
+ err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
if (err)
goto out;
@@ -3000,13 +3026,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
struct fuse_poll_out outarg;
FUSE_ARGS(args);
int err;
- if (fc->no_poll)
+ if (fm->fc->no_poll)
return DEFAULT_POLLMASK;
poll_wait(file, &ff->poll_wait, wait);
@@ -3018,7 +3044,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
*/
if (waitqueue_active(&ff->poll_wait)) {
inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
- fuse_register_polled_file(fc, ff);
+ fuse_register_polled_file(fm->fc, ff);
}
args.opcode = FUSE_POLL;
@@ -3029,12 +3055,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
return demangle_poll(outarg.revents);
if (err == -ENOSYS) {
- fc->no_poll = 1;
+ fm->fc->no_poll = 1;
return DEFAULT_POLLMASK;
}
return EPOLLERR;
@@ -3091,11 +3117,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- bool async_dio = ff->fc->async_dio;
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
- size_t count = iov_iter_count(iter);
+ size_t count = iov_iter_count(iter), shortened = 0;
loff_t offset = iocb->ki_pos;
struct fuse_io_priv *io;
@@ -3103,17 +3128,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
inode = file->f_mapping->host;
i_size = i_size_read(inode);
- if ((iov_iter_rw(iter) == READ) && (offset > i_size))
+ if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
return 0;
- /* optimization for short read */
- if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
- if (offset >= i_size)
- return 0;
- iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
- count = iov_iter_count(iter);
- }
-
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
if (!io)
return -ENOMEM;
@@ -3129,15 +3146,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
- io->async = async_dio;
+ io->async = ff->fm->fc->async_dio;
io->iocb = iocb;
io->blocking = is_sync_kiocb(iocb);
+ /* optimization for short read */
+ if (io->async && !io->write && offset + count > i_size) {
+ iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
+ shortened = count - iov_iter_count(iter);
+ count -= shortened;
+ }
+
/*
* We cannot asynchronously extend the size of a file.
* In such a case the aio will behave exactly like sync io.
*/
- if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
+ if ((offset + count > i_size) && io->write)
io->blocking = true;
if (io->async && io->blocking) {
@@ -3155,6 +3179,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
+ iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
if (io->async) {
bool blocking = io->blocking;
@@ -3197,7 +3222,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
FUSE_ARGS(args);
struct fuse_fallocate_in inarg = {
.fh = ff->fh,
@@ -3209,14 +3234,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & FALLOC_FL_PUNCH_HOLE);
+ bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
+
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (fc->no_fallocate)
+ if (fm->fc->no_fallocate)
return -EOPNOTSUPP;
if (lock_inode) {
inode_lock(inode);
+ if (block_faults) {
+ down_write(&fi->i_mmap_sem);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
loff_t endbyte = offset + length - 1;
@@ -3241,9 +3275,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_fallocate = 1;
+ fm->fc->no_fallocate = 1;
err = -EOPNOTSUPP;
}
if (err)
@@ -3253,7 +3287,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
bool changed = fuse_write_update_size(inode, offset + length);
- if (changed && fc->writeback_cache)
+ if (changed && fm->fc->writeback_cache)
file_update_time(file);
}
@@ -3266,6 +3300,9 @@ out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (block_faults)
+ up_write(&fi->i_mmap_sem);
+
if (lock_inode)
inode_unlock(inode);
@@ -3281,7 +3318,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
struct fuse_inode *fi_out = get_fuse_inode(inode_out);
- struct fuse_conn *fc = ff_in->fc;
+ struct fuse_mount *fm = ff_in->fm;
+ struct fuse_conn *fc = fm->fc;
FUSE_ARGS(args);
struct fuse_copy_file_range_in inarg = {
.fh_in = ff_in->fh,
@@ -3350,7 +3388,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
@@ -3405,6 +3443,7 @@ static const struct file_operations fuse_file_operations = {
.release = fuse_release,
.fsync = fuse_fsync,
.lock = fuse_file_lock,
+ .get_unmapped_area = thp_get_unmapped_area,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -3440,4 +3479,7 @@ void fuse_init_file_inode(struct inode *inode)
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
fi->writepages = RB_ROOT;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_inode_init(inode);
}
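
Taken together, the file.c hunks above are one mechanical conversion: request helpers now take a struct fuse_mount, while negotiated feature bits remain on the shared struct fuse_conn and are reached as fm->fc. A minimal compilable sketch of the resulting pattern, with the structs reduced to the fields used here and lseek_sketch() as an illustrative stand-in rather than a kernel function:

struct fuse_conn  { unsigned no_lseek:1;  /* negotiated feature bit, shared */ };
struct fuse_mount { struct fuse_conn *fc; /* shared connection */ };

static long lseek_sketch(struct fuse_mount *fm)
{
	/* Connection-wide fallback flags stay on fm->fc ... */
	if (fm->fc->no_lseek)
		return -1;	/* caller falls back to generic lseek */

	/* ... while requests are issued against the mount, as in
	 * err = fuse_simple_request(fm, &args); in the real code. */
	return 0;
}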
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 740a8a7d7ae6..d51598017d13 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -148,6 +148,20 @@ struct fuse_inode {
/** Lock to protect write related fields */
spinlock_t lock;
+
+ /**
+ * Can't take inode lock in fault path (leads to circular dependency).
+ * Introduce another semaphore which can be taken in the fault path,
+ * and which other filesystem paths can take to block faults.
+ */
+ struct rw_semaphore i_mmap_sem;
+
+#ifdef CONFIG_FUSE_DAX
+ /*
+ * Dax specific inode data
+ */
+ struct fuse_inode_dax *dax;
+#endif
};
/** FUSE inode state bits */
@@ -161,12 +175,13 @@ enum {
};
struct fuse_conn;
+struct fuse_mount;
struct fuse_release_args;
/** FUSE specific file data */
struct fuse_file {
/** Fuse connection for this file */
- struct fuse_conn *fc;
+ struct fuse_mount *fm;
/* Argument space reserved for release */
struct fuse_release_args *release_args;
@@ -252,7 +267,7 @@ struct fuse_args {
bool may_block:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
- void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
+ void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
};
struct fuse_args_pages {
@@ -360,6 +375,9 @@ struct fuse_req {
/** virtio-fs's physically contiguous buffer for in and out args */
void *argbuf;
#endif
+
+ /** fuse_mount this request belongs to */
+ struct fuse_mount *fm;
};
struct fuse_iqueue;
@@ -482,11 +500,15 @@ struct fuse_fs_context {
bool destroy:1;
bool no_control:1;
bool no_force_umount:1;
- bool no_mount_options:1;
+ bool legacy_opts_show:1;
+ bool dax:1;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
+ /* DAX device, may be NULL */
+ struct dax_device *dax_dev;
+
/* fuse_dev pointer to fill in, should contain NULL on entry */
void **fudptr;
};
@@ -494,9 +516,9 @@ struct fuse_fs_context {
/**
* A Fuse connection.
*
- * This structure is created, when the filesystem is mounted, and is
- * destroyed, when the client device is closed and the filesystem is
- * unmounted.
+ * This structure is created when the root filesystem is mounted, and
+ * is destroyed when the client device is closed and the last
+ * fuse_mount is destroyed.
*/
struct fuse_conn {
/** Lock protecting accesses to members of this structure */
@@ -610,6 +632,9 @@ struct fuse_conn {
/** cache READLINK responses in page cache */
unsigned cache_symlinks:1;
+ /* show legacy mount options */
+ unsigned int legacy_opts_show:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -717,8 +742,8 @@ struct fuse_conn {
/** Do not allow MNT_FORCE umount */
unsigned int no_force_umount:1;
- /* Do not show mount options */
- unsigned int no_mount_options:1;
+ /* Auto-mount submounts announced by the server */
+ unsigned int auto_submounts:1;
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -726,10 +751,10 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Entry on the fuse_conn_list */
+ /** Entry on the fuse_mount_list */
struct list_head entry;
- /** Device ID from super block */
+ /** Device ID from the root super block */
dev_t dev;
/** Dentries in the control filesystem */
@@ -747,24 +772,70 @@ struct fuse_conn {
/** Called on final put */
void (*release)(struct fuse_conn *);
- /** Super block for this connection. */
- struct super_block *sb;
-
- /** Read/write semaphore to hold when accessing sb. */
+ /**
+ * Read/write semaphore to hold when accessing the sb of any
+ * fuse_mount belonging to this connection
+ */
struct rw_semaphore killsb;
/** List of device instances belonging to this connection */
struct list_head devices;
+
+#ifdef CONFIG_FUSE_DAX
+ /* Dax specific conn data, non-NULL if DAX is enabled */
+ struct fuse_conn_dax *dax;
+#endif
+
+ /** List of filesystems using this connection */
+ struct list_head mounts;
};
-static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+/*
+ * Represents a mounted filesystem, potentially a submount.
+ *
+ * This object allows sharing a fuse_conn between separate mounts to
+ * allow submounts with dedicated superblocks and thus separate device
+ * IDs.
+ */
+struct fuse_mount {
+ /* Underlying (potentially shared) connection to the FUSE server */
+ struct fuse_conn *fc;
+
+ /* Refcount */
+ refcount_t count;
+
+ /*
+ * Super block for this connection (fc->killsb must be held when
+ * accessing this).
+ */
+ struct super_block *sb;
+
+ /* Entry on fc->mounts */
+ struct list_head fc_entry;
+};
+
+static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
{
return sb->s_fs_info;
}
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+
+ return fm ? fm->fc : NULL;
+}
+
+static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
+{
+ return get_fuse_mount_super(inode->i_sb);
+}
+
static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
{
- return get_fuse_conn_super(inode->i_sb);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+
+ return fm ? fm->fc : NULL;
}
static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
@@ -794,11 +865,6 @@ extern const struct dentry_operations fuse_dentry_operations;
extern const struct dentry_operations fuse_root_dentry_operations;
/**
- * Inode to nodeid comparison.
- */
-int fuse_inode_eq(struct inode *inode, void *_nodeidp);
-
-/**
* Get a filled in inode
*/
struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -848,7 +914,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
*/
int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
@@ -916,14 +982,14 @@ void __exit fuse_ctl_cleanup(void);
/**
* Simple request sending that does request allocation and freeing
*/
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
/**
* End a finished request
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_end(struct fuse_req *req);
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
@@ -949,7 +1015,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
/**
@@ -957,11 +1024,21 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
*/
void fuse_conn_put(struct fuse_conn *fc);
+/**
+ * Acquire reference to fuse_mount
+ */
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm);
+
+/**
+ * Release reference to fuse_mount
+ */
+void fuse_mount_put(struct fuse_mount *fm);
+
struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_conn *fc);
+void fuse_send_init(struct fuse_mount *fm);
/**
* Fill in superblock and initialize fuse connection
@@ -970,12 +1047,26 @@ void fuse_send_init(struct fuse_conn *fc);
*/
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
-/**
- * Disassociate fuse connection from superblock and kill the superblock
+/*
+ * Fill in superblock for submounts
+ * @sb: partially-initialized superblock to fill in
+ * @parent_fi: The fuse_inode of the parent filesystem where this submount is
+ * mounted
+ */
+int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi);
+
+/*
+ * Remove the mount from the connection
*
- * Calls kill_anon_super(), do not use with bdev mounts.
+ * Returns whether this was the last mount
*/
-void fuse_kill_sb_anon(struct super_block *sb);
+bool fuse_mount_remove(struct fuse_mount *fm);
+
+/*
+ * Shut down the connection (possibly sending DESTROY request).
+ */
+void fuse_conn_destroy(struct fuse_mount *fm);
/**
* Add connection to control filesystem
@@ -1011,9 +1102,19 @@ void fuse_set_nowrite(struct inode *inode);
void fuse_release_nowrite(struct inode *inode);
/**
+ * Scan all fuse_mounts belonging to fc to find the first where
+ * ilookup5() returns a result. Return that result and the
+ * respective fuse_mount in *fm (unless fm is NULL).
+ *
+ * The caller must hold fc->killsb.
+ */
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm);
+
+/**
* File-system tells the kernel to invalidate cache for the given node id.
*/
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len);
/**
@@ -1026,10 +1127,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
* - is a file or an empty directory
* then the dentry is unhashed (d_delete()).
*/
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name);
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
/**
@@ -1093,4 +1194,20 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
u64 fuse_get_unique(struct fuse_iqueue *fiq);
void fuse_free_conn(struct fuse_conn *fc);
+/* dax.c */
+
+#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
+void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
+void fuse_dax_inode_init(struct inode *inode);
+void fuse_dax_inode_cleanup(struct inode *inode);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
+void fuse_dax_cancel_work(struct fuse_conn *fc);
+
#endif /* _FS_FUSE_I_H */
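
The new struct fuse_mount is reference counted, and fuse_mount_put() encodes the lifetime rule: every mount pins the connection, and the connection is released only with the last mount. A userspace toy model of that rule (plain integers instead of refcount_t, names shortened; not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct conn { unsigned refs; };
struct mnt  { struct conn *fc; unsigned refs; };

static void conn_put(struct conn *fc)
{
	if (--fc->refs == 0) {
		puts("conn released");	/* fc->release(fc) in the kernel */
		free(fc);
	}
}

static void mnt_put(struct mnt *fm)
{
	if (--fm->refs == 0) {
		conn_put(fm->fc);	/* mirrors fuse_mount_put() */
		free(fm);
	}
}

int main(void)
{
	struct conn *fc = calloc(1, sizeof(*fc));
	struct mnt *root = calloc(1, sizeof(*root));
	struct mnt *sub = calloc(1, sizeof(*sub));

	fc->refs = 2;			/* one reference per mount */
	root->fc = sub->fc = fc;
	root->refs = sub->refs = 1;

	mnt_put(sub);			/* conn survives the submount */
	mnt_put(root);			/* last put releases the conn */
	return 0;
}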
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bba747520e9b..1a47afc95f80 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -85,14 +85,22 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
+ init_rwsem(&fi->i_mmap_sem);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
- if (!fi->forget) {
- kmem_cache_free(fuse_inode_cachep, fi);
- return NULL;
- }
+ if (!fi->forget)
+ goto out_free;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
+ goto out_free_forget;
return &fi->inode;
+
+out_free_forget:
+ kfree(fi->forget);
+out_free:
+ kmem_cache_free(fuse_inode_cachep, fi);
+ return NULL;
}
static void fuse_free_inode(struct inode *inode)
@@ -101,6 +109,9 @@ static void fuse_free_inode(struct inode *inode)
mutex_destroy(&fi->mutex);
kfree(fi->forget);
+#ifdef CONFIG_FUSE_DAX
+ kfree(fi->dax);
+#endif
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -112,8 +123,14 @@ static void fuse_evict_inode(struct inode *inode)
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
- fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
- fi->forget = NULL;
+
+ if (FUSE_IS_DAX(inode))
+ fuse_dax_inode_cleanup(inode);
+ if (fi->nlookup) {
+ fuse_queue_forget(fc, fi->forget, fi->nodeid,
+ fi->nlookup);
+ fi->forget = NULL;
+ }
}
if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
WARN_ON(!list_empty(&fi->write_files));
@@ -268,7 +285,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
BUG();
}
-int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
{
u64 nodeid = *(u64 *) _nodeidp;
if (get_node_id(inode) == nodeid)
@@ -292,7 +309,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
struct fuse_inode *fi;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- retry:
+ /*
+ * Auto mount points get their node id from the submount root, which is
+ * not a unique identifier within this filesystem.
+ *
+ * To avoid conflicts, do not place submount points into the inode hash
+ * table.
+ */
+ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
+ S_ISDIR(attr->mode)) {
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ fuse_init_inode(inode, attr);
+ get_fuse_inode(inode)->nodeid = nodeid;
+ inode->i_flags |= S_AUTOMOUNT;
+ goto done;
+ }
+
+retry:
inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
if (!inode)
return NULL;
@@ -310,7 +346,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
iput(inode);
goto retry;
}
-
+done:
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->nlookup++;
@@ -320,16 +356,37 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
return inode;
}
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm)
+{
+ struct fuse_mount *fm_iter;
+ struct inode *inode;
+
+ WARN_ON(!rwsem_is_locked(&fc->killsb));
+ list_for_each_entry(fm_iter, &fc->mounts, fc_entry) {
+ if (!fm_iter->sb)
+ continue;
+
+ inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid);
+ if (inode) {
+ if (fm)
+ *fm = fm_iter;
+ return inode;
+ }
+ }
+
+ return NULL;
+}
+
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
struct fuse_inode *fi;
struct inode *inode;
pgoff_t pg_start;
pgoff_t pg_end;
- inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
return -ENOENT;
@@ -379,28 +436,23 @@ static void fuse_umount_begin(struct super_block *sb)
fuse_abort_conn(fc);
}
-static void fuse_send_destroy(struct fuse_conn *fc)
+static void fuse_send_destroy(struct fuse_mount *fm)
{
- if (fc->conn_init) {
+ if (fm->fc->conn_init) {
FUSE_ARGS(args);
args.opcode = FUSE_DESTROY;
args.force = true;
args.nocreds = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
}
}
static void fuse_put_super(struct super_block *sb)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
- mutex_lock(&fuse_mutex);
- list_del(&fc->entry);
- fuse_ctl_remove_conn(fc);
- mutex_unlock(&fuse_mutex);
-
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
}
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
@@ -420,12 +472,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_statfs_out outarg;
int err;
- if (!fuse_allow_current_process(fc)) {
+ if (!fuse_allow_current_process(fm->fc)) {
buf->f_type = FUSE_SUPER_MAGIC;
return 0;
}
@@ -437,7 +489,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
convert_fuse_statfs(buf, &outarg.st);
return err;
@@ -573,19 +625,25 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- if (fc->no_mount_options)
- return 0;
+ if (fc->legacy_opts_show) {
+ seq_printf(m, ",user_id=%u",
+ from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u",
+ from_kgid_munged(fc->user_ns, fc->group_id));
+ if (fc->default_permissions)
+ seq_puts(m, ",default_permissions");
+ if (fc->allow_other)
+ seq_puts(m, ",allow_other");
+ if (fc->max_read != ~0)
+ seq_printf(m, ",max_read=%u", fc->max_read);
+ if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+ seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+ }
+#ifdef CONFIG_FUSE_DAX
+ if (fc->dax)
+ seq_puts(m, ",dax");
+#endif
- seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
- seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
- if (fc->default_permissions)
- seq_puts(m, ",default_permissions");
- if (fc->allow_other)
- seq_puts(m, ",allow_other");
- if (fc->max_read != ~0)
- seq_printf(m, ",max_read=%u", fc->max_read);
- if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
- seq_printf(m, ",blksize=%lu", sb->s_blocksize);
return 0;
}
@@ -615,7 +673,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
fpq->connected = 1;
}
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
{
memset(fc, 0, sizeof(*fc));
@@ -642,6 +701,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
+
+ INIT_LIST_HEAD(&fc->mounts);
+ list_add(&fm->fc_entry, &fc->mounts);
+ fm->fc = fc;
+ refcount_set(&fm->count, 1);
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -650,6 +714,8 @@ void fuse_conn_put(struct fuse_conn *fc)
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
@@ -666,6 +732,23 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
+void fuse_mount_put(struct fuse_mount *fm)
+{
+ if (refcount_dec_and_test(&fm->count)) {
+ if (fm->fc)
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_mount_put);
+
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm)
+{
+ refcount_inc(&fm->count);
+ return fm;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_get);
+
static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
@@ -895,14 +978,16 @@ struct fuse_init_args {
struct fuse_init_out out;
};
-static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
+static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
struct fuse_init_out *arg = &ia->out;
+ bool ok = true;
if (error || arg->major != FUSE_KERNEL_VERSION)
- fc->conn_error = 1;
+ ok = false;
else {
unsigned long ra_pages;
@@ -950,11 +1035,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
if (arg->flags & FUSE_HANDLE_KILLPRIV)
fc->handle_killpriv = 1;
if (arg->time_gran && arg->time_gran <= 1000000000)
- fc->sb->s_time_gran = arg->time_gran;
+ fm->sb->s_time_gran = arg->time_gran;
if ((arg->flags & FUSE_POSIX_ACL)) {
fc->default_permissions = 1;
fc->posix_acl = 1;
- fc->sb->s_xattr = fuse_acl_xattr_handlers;
+ fm->sb->s_xattr = fuse_acl_xattr_handlers;
}
if (arg->flags & FUSE_CACHE_SYMLINKS)
fc->cache_symlinks = 1;
@@ -965,14 +1050,19 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
min_t(unsigned int, FUSE_MAX_MAX_PAGES,
max_t(unsigned int, arg->max_pages, 1));
}
+ if (IS_ENABLED(CONFIG_FUSE_DAX) &&
+ arg->flags & FUSE_MAP_ALIGNMENT &&
+ !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+ ok = false;
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
- fc->sb->s_bdi->ra_pages =
- min(fc->sb->s_bdi->ra_pages, ra_pages);
+ fm->sb->s_bdi->ra_pages =
+ min(fm->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -980,11 +1070,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
}
kfree(ia);
+ if (!ok) {
+ fc->conn_init = 0;
+ fc->conn_error = 1;
+ }
+
fuse_set_initialized(fc);
wake_up_all(&fc->blocked_waitq);
}
-void fuse_send_init(struct fuse_conn *fc)
+void fuse_send_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
@@ -992,7 +1087,7 @@ void fuse_send_init(struct fuse_conn *fc)
ia->in.major = FUSE_KERNEL_VERSION;
ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
- ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
+ ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE;
ia->in.flags |=
FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
@@ -1003,6 +1098,13 @@ void fuse_send_init(struct fuse_conn *fc)
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+#ifdef CONFIG_FUSE_DAX
+ if (fm->fc->dax)
+ ia->in.flags |= FUSE_MAP_ALIGNMENT;
+#endif
+ if (fm->fc->auto_submounts)
+ ia->in.flags |= FUSE_SUBMOUNTS;
+
ia->args.opcode = FUSE_INIT;
ia->args.in_numargs = 1;
ia->args.in_args[0].size = sizeof(ia->in);
@@ -1018,8 +1120,8 @@ void fuse_send_init(struct fuse_conn *fc)
ia->args.nocreds = true;
ia->args.end = process_init_reply;
- if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
- process_init_reply(fc, &ia->args, -ENOTCONN);
+ if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
+ process_init_reply(fm, &ia->args, -ENOTCONN);
}
EXPORT_SYMBOL_GPL(fuse_send_init);
@@ -1049,9 +1151,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does its own writeback accounting */
- sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
+ sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
+ sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
@@ -1130,10 +1232,92 @@ void fuse_dev_free(struct fuse_dev *fud)
}
EXPORT_SYMBOL_GPL(fuse_dev_free);
+static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
+ const struct fuse_inode *fi)
+{
+ *attr = (struct fuse_attr){
+ .ino = fi->inode.i_ino,
+ .size = fi->inode.i_size,
+ .blocks = fi->inode.i_blocks,
+ .atime = fi->inode.i_atime.tv_sec,
+ .mtime = fi->inode.i_mtime.tv_sec,
+ .ctime = fi->inode.i_ctime.tv_sec,
+ .atimensec = fi->inode.i_atime.tv_nsec,
+ .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .ctimensec = fi->inode.i_ctime.tv_nsec,
+ .mode = fi->inode.i_mode,
+ .nlink = fi->inode.i_nlink,
+ .uid = fi->inode.i_uid.val,
+ .gid = fi->inode.i_gid.val,
+ .rdev = fi->inode.i_rdev,
+ .blksize = 1u << fi->inode.i_blkbits,
+ };
+}
+
+static void fuse_sb_defaults(struct super_block *sb)
+{
+ sb->s_magic = FUSE_SUPER_MAGIC;
+ sb->s_op = &fuse_super_operations;
+ sb->s_xattr = fuse_xattr_handlers;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_export_op = &fuse_export_operations;
+ sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
+ sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+
+ /*
+ * If we are not in the initial user namespace, posix
+ * acls must be translated.
+ */
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_xattr = fuse_no_acl_xattr_handlers;
+}
+
+int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct super_block *parent_sb = parent_fi->inode.i_sb;
+ struct fuse_attr root_attr;
+ struct inode *root;
+
+ fuse_sb_defaults(sb);
+ fm->sb = sb;
+
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi_get(parent_sb->s_bdi);
+
+ sb->s_xattr = parent_sb->s_xattr;
+ sb->s_time_gran = parent_sb->s_time_gran;
+ sb->s_blocksize = parent_sb->s_blocksize;
+ sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
+ sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL);
+ if (parent_sb->s_subtype && !sb->s_subtype)
+ return -ENOMEM;
+
+ fuse_fill_attr_from_inode(&root_attr, parent_fi);
+ root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0);
+ /*
+ * This inode is just a duplicate, so it is not looked up and
+ * its nlookup should not be incremented. fuse_iget() does
+ * that, though, so undo it here.
+ */
+ get_fuse_inode(root)->nlookup--;
+ sb->s_d_op = &fuse_dentry_operations;
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
struct fuse_dev *fud = NULL;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct inode *root;
struct dentry *root_dentry;
int err;
@@ -1142,7 +1326,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_flags & SB_MANDLOCK)
goto err;
- sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+ fuse_sb_defaults(sb);
if (ctx->is_bdev) {
#ifdef CONFIG_BLOCK
@@ -1157,32 +1341,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
sb->s_subtype = ctx->subtype;
ctx->subtype = NULL;
- sb->s_magic = FUSE_SUPER_MAGIC;
- sb->s_op = &fuse_super_operations;
- sb->s_xattr = fuse_xattr_handlers;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_export_op = &fuse_export_operations;
- sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
- if (sb->s_user_ns != &init_user_ns)
- sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
-
- /*
- * If we are not in the initial user namespace posix
- * acls must be translated.
- */
- if (sb->s_user_ns != &init_user_ns)
- sb->s_xattr = fuse_no_acl_xattr_handlers;
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ err = fuse_dax_conn_alloc(fc, ctx->dax_dev);
+ if (err)
+ goto err;
+ }
if (ctx->fudptr) {
err = -ENOMEM;
fud = fuse_dev_alloc_install(fc);
if (!fud)
- goto err;
+ goto err_free_dax;
}
fc->dev = sb->s_dev;
- fc->sb = sb;
+ fm->sb = sb;
err = fuse_bdi_init(fc, sb);
if (err)
goto err_dev_free;
@@ -1196,11 +1369,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
fc->allow_other = ctx->allow_other;
fc->user_id = ctx->user_id;
fc->group_id = ctx->group_id;
- fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+ fc->legacy_opts_show = ctx->legacy_opts_show;
+ fc->max_read = max_t(unsigned int, 4096, ctx->max_read);
fc->destroy = ctx->destroy;
fc->no_control = ctx->no_control;
fc->no_force_umount = ctx->no_force_umount;
- fc->no_mount_options = ctx->no_mount_options;
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
@@ -1233,6 +1406,9 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err_dev_free:
if (fud)
fuse_dev_free(fud);
+ err_free_dax:
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
err:
return err;
}
@@ -1244,6 +1420,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
struct file *file;
int err;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
err = -EINVAL;
file = fget(ctx->fd);
@@ -1264,9 +1441,16 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
if (!fc)
goto err_fput;
- fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ goto err_fput;
+ }
+
+ fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
fc->release = fuse_free_conn;
- sb->s_fs_info = fc;
+
+ sb->s_fs_info = fm;
err = fuse_fill_super_common(sb, ctx);
if (err)
@@ -1277,11 +1461,11 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
* CPUs after this
*/
fput(file);
- fuse_send_init(get_fuse_conn_super(sb));
+ fuse_send_init(get_fuse_mount_super(sb));
return 0;
err_put_conn:
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
sb->s_fs_info = NULL;
err_fput:
fput(file);
@@ -1325,6 +1509,7 @@ static int fuse_init_fs_context(struct fs_context *fc)
ctx->max_read = ~0;
ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+ ctx->legacy_opts_show = true;
#ifdef CONFIG_BLOCK
if (fc->fs_type == &fuseblk_fs_type) {
@@ -1338,29 +1523,52 @@ static int fuse_init_fs_context(struct fs_context *fc)
return 0;
}
-static void fuse_sb_destroy(struct super_block *sb)
+bool fuse_mount_remove(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ bool last = false;
- if (fc) {
- if (fc->destroy)
- fuse_send_destroy(fc);
+ down_write(&fc->killsb);
+ list_del_init(&fm->fc_entry);
+ if (list_empty(&fc->mounts))
+ last = true;
+ up_write(&fc->killsb);
- fuse_abort_conn(fc);
- fuse_wait_aborted(fc);
+ return last;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_remove);
- down_write(&fc->killsb);
- fc->sb = NULL;
- up_write(&fc->killsb);
+void fuse_conn_destroy(struct fuse_mount *fm)
+{
+ struct fuse_conn *fc = fm->fc;
+
+ if (fc->destroy)
+ fuse_send_destroy(fm);
+
+ fuse_abort_conn(fc);
+ fuse_wait_aborted(fc);
+
+ if (!list_empty(&fc->entry)) {
+ mutex_lock(&fuse_mutex);
+ list_del(&fc->entry);
+ fuse_ctl_remove_conn(fc);
+ mutex_unlock(&fuse_mutex);
}
}
+EXPORT_SYMBOL_GPL(fuse_conn_destroy);
-void fuse_kill_sb_anon(struct super_block *sb)
+static void fuse_kill_sb_anon(struct super_block *sb)
{
- fuse_sb_destroy(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
kill_anon_super(sb);
}
-EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
@@ -1375,7 +1583,14 @@ MODULE_ALIAS_FS("fuse");
#ifdef CONFIG_BLOCK
static void fuse_kill_sb_blk(struct super_block *sb)
{
- fuse_sb_destroy(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
kill_block_super(sb);
}
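
The fuse_iget() hunk above is the subtle part of the inode.c changes: submount roots reuse the submount's nodeid, which is not unique within the parent filesystem, so they bypass the nodeid-keyed inode hash. A compilable restatement of just that decision (ATTR_SUBMOUNT stands in for the uapi FUSE_ATTR_SUBMOUNT flag; use_inode_hash() is illustrative, not a kernel function):

#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

#define ATTR_SUBMOUNT 0x1	/* stand-in for FUSE_ATTR_SUBMOUNT */

static bool use_inode_hash(bool auto_submounts, unsigned attr_flags,
			   mode_t mode)
{
	/* Submount roots get new_inode() + S_AUTOMOUNT instead of the
	 * hash, avoiding nodeid collisions between submounts. */
	if (auto_submounts && (attr_flags & ATTR_SUBMOUNT) && S_ISDIR(mode))
		return false;
	return true;		/* iget5_locked() keyed by nodeid */
}

int main(void)
{
	printf("%d\n", use_inode_hash(true, ATTR_SUBMOUNT, S_IFDIR)); /* 0 */
	printf("%d\n", use_inode_hash(true, 0, S_IFREG));             /* 1 */
	return 0;
}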
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 90e3f01bd796..3b5e91045871 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -252,7 +252,7 @@ retry:
static void fuse_force_forget(struct file *file, u64 nodeid)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_forget_in inarg;
FUSE_ARGS(args);
@@ -266,7 +266,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid)
args.force = true;
args.noreply = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
/* ignore errors */
}
@@ -320,7 +320,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
ssize_t res;
struct page *page;
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
struct fuse_page_desc desc = { .length = PAGE_SIZE };
@@ -337,7 +337,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
ap->pages = &page;
ap->descs = &desc;
if (plus) {
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
@@ -345,7 +345,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
fuse_unlock_inode(inode, locked);
if (res >= 0) {
if (!res) {
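
Before the virtio_fs.c diff, note the teardown shape this patch repeats in fuse_kill_sb_anon(), fuse_kill_sb_blk() and virtio_kill_sb() below: detach this mount, and only the last mount tears the connection down. A hedged sketch with stand-in types (mount_remove() and conn_destroy() model fuse_mount_remove() and fuse_conn_destroy(); the fc->killsb locking is omitted):

#include <stdbool.h>

struct fuse_conn_sketch  { int mounts_left; };
struct fuse_mount_sketch { struct fuse_conn_sketch *fc; };

/* fuse_mount_remove() returns true for the last mount on the conn. */
static bool mount_remove(struct fuse_mount_sketch *fm)
{
	return --fm->fc->mounts_left == 0;
}

static void conn_destroy(struct fuse_mount_sketch *fm)
{
	(void)fm;	/* DESTROY request, abort, control-fs removal */
}

static void kill_sb_shape(struct fuse_mount_sketch *fm)
{
	if (fm && mount_remove(fm))	/* fm may be NULL if mount failed */
		conn_destroy(fm);
	/* followed by kill_anon_super() or kill_block_super() */
}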
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 104f35de5270..21a9e534417c 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -5,12 +5,17 @@
*/
#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
#include <linux/delay.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/highmem.h>
+#include <linux/uio.h>
#include "fuse_i.h"
/* List of virtio-fs device instances and a lock for the list. Also provides
@@ -24,6 +29,8 @@ enum {
VQ_REQUEST
};
+#define VQ_NAME_LEN 24
+
/* Per-virtqueue state */
struct virtio_fs_vq {
spinlock_t lock;
@@ -36,7 +43,7 @@ struct virtio_fs_vq {
bool connected;
long in_flight;
struct completion in_flight_zero; /* No inflight requests */
- char name[24];
+ char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
/* A virtio-fs device instance */
@@ -47,6 +54,12 @@ struct virtio_fs {
struct virtio_fs_vq *vqs;
unsigned int nvqs; /* number of virtqueues */
unsigned int num_request_queues; /* number of request queues */
+ struct dax_device *dax_dev;
+
+ /* DAX memory window where file contents are mapped */
+ void *window_kaddr;
+ phys_addr_t window_phys_addr;
+ size_t window_len;
};
struct virtio_fs_forget_req {
@@ -69,6 +82,44 @@ struct virtio_fs_req_work {
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
struct fuse_req *req, bool in_flight);
+enum {
+ OPT_DAX,
+};
+
+static const struct fs_parameter_spec virtio_fs_parameters[] = {
+ fsparam_flag("dax", OPT_DAX),
+ {}
+};
+
+static int virtio_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct fuse_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, virtio_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case OPT_DAX:
+ ctx->dax = 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void virtio_fs_free_fc(struct fs_context *fc)
+{
+ struct fuse_fs_context *ctx = fc->fs_private;
+
+ kfree(ctx);
+}
+
static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
{
struct virtio_fs *fs = vq->vdev->priv;
@@ -289,7 +340,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
struct fuse_req *req;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
dispatch_work.work);
- struct fuse_conn *fc = fsvq->fud->fc;
int ret;
pr_debug("virtio-fs: worker %s called.\n", __func__);
@@ -304,7 +354,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
list_del_init(&req->list);
spin_unlock(&fsvq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
/* Dispatch pending requests */
@@ -335,7 +385,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
spin_unlock(&fsvq->lock);
pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
ret);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
}
@@ -495,7 +545,6 @@ static void virtio_fs_request_complete(struct fuse_req *req,
struct virtio_fs_vq *fsvq)
{
struct fuse_pqueue *fpq = &fsvq->fud->pq;
- struct fuse_conn *fc = fsvq->fud->fc;
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
@@ -528,7 +577,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
clear_bit(FR_SENT, &req->flags);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
spin_lock(&fsvq->lock);
dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
@@ -596,6 +645,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
schedule_work(&fsvq->done_work);
}
+static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
+ int vq_type)
+{
+ strncpy(fsvq->name, name, VQ_NAME_LEN);
+ spin_lock_init(&fsvq->lock);
+ INIT_LIST_HEAD(&fsvq->queued_reqs);
+ INIT_LIST_HEAD(&fsvq->end_reqs);
+ init_completion(&fsvq->in_flight_zero);
+
+ if (vq_type == VQ_REQUEST) {
+ INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_request_dispatch_work);
+ } else {
+ INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
+ }
+}
+
/* Initialize virtqueues */
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
@@ -611,7 +680,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
if (fs->num_request_queues == 0)
return -EINVAL;
- fs->nvqs = 1 + fs->num_request_queues;
+ fs->nvqs = VQ_REQUEST + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
return -ENOMEM;
@@ -625,29 +694,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
goto out;
}
+ /* Initialize the hiprio/forget request virtqueue */
callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
- snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
- "hiprio");
+ virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
- INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
- INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
- virtio_fs_hiprio_dispatch_work);
- init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero);
- spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
/* Initialize the requests virtqueues */
for (i = VQ_REQUEST; i < fs->nvqs; i++) {
- spin_lock_init(&fs->vqs[i].lock);
- INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
- INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
- virtio_fs_request_dispatch_work);
- INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
- init_completion(&fs->vqs[i].in_flight_zero);
- snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
- "requests.%u", i - VQ_REQUEST);
+ char vq_name[VQ_NAME_LEN];
+
+ snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
+ virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
callbacks[i] = virtio_fs_vq_done;
names[i] = fs->vqs[i].name;
}
@@ -676,6 +733,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
vdev->config->del_vqs(vdev);
}
+/* Map a window offset to a page frame number. The window offset will have
+ * been produced by .iomap_begin(), which maps a file offset to a window
+ * offset.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ struct virtio_fs *fs = dax_get_private(dax_dev);
+ phys_addr_t offset = PFN_PHYS(pgoff);
+ size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
+
+ if (kaddr)
+ *kaddr = fs->window_kaddr + offset;
+ if (pfn)
+ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
+ PFN_DEV | PFN_MAP);
+ return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
+}
+
+static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
+ pgoff_t pgoff, void *addr,
+ size_t bytes, struct iov_iter *i)
+{
+ return copy_from_iter(addr, bytes, i);
+}
+
+static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
+ pgoff_t pgoff, void *addr,
+ size_t bytes, struct iov_iter *i)
+{
+ return copy_to_iter(addr, bytes, i);
+}
+
+static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
+ pgoff_t pgoff, size_t nr_pages)
+{
+ long rc;
+ void *kaddr;
+
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+ if (rc < 0)
+ return rc;
+ memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+ dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+ return 0;
+}
+
+static const struct dax_operations virtio_fs_dax_ops = {
+ .direct_access = virtio_fs_direct_access,
+ .copy_from_iter = virtio_fs_copy_from_iter,
+ .copy_to_iter = virtio_fs_copy_to_iter,
+ .zero_page_range = virtio_fs_zero_page_range,
+};
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+ struct dax_device *dax_dev = data;
+
+ kill_dax(dax_dev);
+ put_dax(dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ struct virtio_shm_region cache_reg;
+ struct dev_pagemap *pgmap;
+ bool have_cache;
+
+ if (!IS_ENABLED(CONFIG_FUSE_DAX))
+ return 0;
+
+ /* Get cache region */
+ have_cache = virtio_get_shm_region(vdev, &cache_reg,
+ (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
+ if (!have_cache) {
+ dev_notice(&vdev->dev, "%s: No cache capability\n", __func__);
+ return 0;
+ }
+
+ if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len,
+ dev_name(&vdev->dev))) {
+ dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n",
+ cache_reg.addr, cache_reg.len);
+ return -EBUSY;
+ }
+
+ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
+ cache_reg.addr);
+
+ pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+ /* Ideally we would directly use the PCI BAR resource but
+ * devm_memremap_pages() wants its own copy in pgmap. So
+ * initialize a struct range from scratch (only the start
+ * and end fields will be used).
+ */
+ pgmap->range = (struct range) {
+ .start = (phys_addr_t) cache_reg.addr,
+ .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+ };
+ pgmap->nr_range = 1;
+
+ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
+ if (IS_ERR(fs->window_kaddr))
+ return PTR_ERR(fs->window_kaddr);
+
+ fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+ fs->window_len = (phys_addr_t) cache_reg.len;
+
+ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
+ __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
+
+ fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0);
+ if (IS_ERR(fs->dax_dev))
+ return PTR_ERR(fs->dax_dev);
+
+ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
+ fs->dax_dev);
+}
+
static int virtio_fs_probe(struct virtio_device *vdev)
{
struct virtio_fs *fs;
@@ -697,6 +878,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
/* TODO vq affinity */
+ ret = virtio_fs_setup_dax(vdev, fs);
+ if (ret < 0)
+ goto out_vqs;
+
/* Bring the device online in case the filesystem is mounted and
* requests need to be sent before we return.
*/
@@ -833,18 +1018,37 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
}
+/* Count number of scatter-gather elements required */
+static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ this_len = min(page_descs[i].length, total_len);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
/* Return the number of scatter-gather list elements required */
static unsigned int sg_count_fuse_req(struct fuse_req *req)
{
struct fuse_args *args = req->args;
struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
- unsigned int total_sgs = 1 /* fuse_in_header */;
+ unsigned int size, total_sgs = 1 /* fuse_in_header */;
if (args->in_numargs - args->in_pages)
total_sgs += 1;
- if (args->in_pages)
- total_sgs += ap->num_pages;
+ if (args->in_pages) {
+ size = args->in_args[args->in_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
if (!test_bit(FR_ISREPLY, &req->flags))
return total_sgs;
@@ -854,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req)
if (args->out_numargs - args->out_pages)
total_sgs += 1;
- if (args->out_pages)
- total_sgs += ap->num_pages;
+ if (args->out_pages) {
+ size = args->out_args[args->out_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
return total_sgs;
}
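
The hunk above stops counting one scatter-gather element per page and instead bounds the count by the argument length. A userspace restatement (count_sg() mirrors the new sg_count_fuse_pages() loop; struct page_desc stands in for struct fuse_page_desc):

#include <stdio.h>

struct page_desc { unsigned int length; };

static unsigned int count_sg(const struct page_desc *descs,
			     unsigned int num_pages, unsigned int total_len)
{
	unsigned int i;

	for (i = 0; i < num_pages && total_len; i++) {
		unsigned int this_len = descs[i].length < total_len ?
					descs[i].length : total_len;
		total_len -= this_len;
	}
	return i;	/* pages actually covered by total_len */
}

int main(void)
{
	struct page_desc descs[4] = { {4096}, {4096}, {4096}, {4096} };

	/* A 5000-byte argument spans only two of the four pages. */
	printf("%u\n", count_sg(descs, 4, 5000));	/* prints 2 */
	return 0;
}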
@@ -1071,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
.release = virtio_fs_fiq_release,
};
-static int virtio_fs_fill_super(struct super_block *sb)
+static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ ctx->rootmode = S_IFDIR;
+ ctx->default_permissions = 1;
+ ctx->allow_other = 1;
+ ctx->max_read = UINT_MAX;
+ ctx->blksize = 512;
+ ctx->destroy = true;
+ ctx->no_control = true;
+ ctx->no_force_umount = true;
+}
+
+static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct virtio_fs *fs = fc->iq.priv;
+ struct fuse_fs_context *ctx = fsc->fs_private;
unsigned int i;
int err;
- struct fuse_fs_context ctx = {
- .rootmode = S_IFDIR,
- .default_permissions = 1,
- .allow_other = 1,
- .max_read = UINT_MAX,
- .blksize = 512,
- .destroy = true,
- .no_control = true,
- .no_force_umount = true,
- .no_mount_options = true,
- };
+ virtio_fs_ctx_set_defaults(ctx);
mutex_lock(&virtio_fs_mutex);
/* After holding mutex, make sure virtiofs device is still there.
@@ -1112,8 +1323,10 @@ static int virtio_fs_fill_super(struct super_block *sb)
}
/* virtiofs allocates and installs its own fuse devices */
- ctx.fudptr = NULL;
- err = fuse_fill_super_common(sb, &ctx);
+ ctx->fudptr = NULL;
+ if (ctx->dax)
+ ctx->dax_dev = fs->dax_dev;
+ err = fuse_fill_super_common(sb, ctx);
if (err < 0)
goto err_free_fuse_devs;
@@ -1125,7 +1338,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
/* Previous unmount will stop all queues. Start these again */
virtio_fs_start_all_queues(fs);
- fuse_send_init(fc);
+ fuse_send_init(fm);
mutex_unlock(&virtio_fs_mutex);
return 0;
@@ -1136,18 +1349,17 @@ err:
return err;
}
-static void virtio_kill_sb(struct super_block *sb)
+static void virtio_fs_conn_destroy(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
- struct virtio_fs *vfs;
- struct virtio_fs_vq *fsvq;
-
- /* If mount failed, we can still be called without any fc */
- if (!fc)
- return fuse_kill_sb_anon(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct virtio_fs *vfs = fc->iq.priv;
+ struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO];
- vfs = fc->iq.priv;
- fsvq = &vfs->vqs[VQ_HIPRIO];
+ /* Stop dax worker. Soon evict_inodes() will be called which
+ * will free all memory ranges belonging to all inodes.
+ */
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_cancel_work(fc);
/* Stop forget queue. Soon destroy will be sent */
spin_lock(&fsvq->lock);
@@ -1155,9 +1367,9 @@ static void virtio_kill_sb(struct super_block *sb)
spin_unlock(&fsvq->lock);
virtio_fs_drain_all_queues(vfs);
- fuse_kill_sb_anon(sb);
+ fuse_conn_destroy(fm);
- /* fuse_kill_sb_anon() must have sent destroy. Stop all queues
+ /* fuse_conn_destroy() must have sent destroy. Stop all queues
* and drain one more time and free fuse devices. Freeing fuse
* devices will drop their reference on fuse_conn and that in
* turn will drop its reference on virtio_fs object.
@@ -1167,12 +1379,27 @@ static void virtio_kill_sb(struct super_block *sb)
virtio_fs_free_devs(vfs);
}
+static void virtio_kill_sb(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ /* If mount failed, we can still be called without any fc */
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ virtio_fs_conn_destroy(fm);
+ }
+ kill_anon_super(sb);
+}
+
static int virtio_fs_test_super(struct super_block *sb,
struct fs_context *fsc)
{
- struct fuse_conn *fc = fsc->s_fs_info;
+ struct fuse_mount *fsc_fm = fsc->s_fs_info;
+ struct fuse_mount *sb_fm = get_fuse_mount_super(sb);
- return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+ return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv;
}
static int virtio_fs_set_super(struct super_block *sb,
@@ -1182,7 +1409,7 @@ static int virtio_fs_set_super(struct super_block *sb,
err = get_anon_bdev(&sb->s_dev);
if (!err)
- fuse_conn_get(fsc->s_fs_info);
+ fuse_mount_get(fsc->s_fs_info);
return err;
}
@@ -1192,6 +1419,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
struct virtio_fs *fs;
struct super_block *sb;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
int err;
/* This gets a reference on virtio_fs object. This ptr gets installed
@@ -1212,19 +1440,29 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
return -ENOMEM;
}
- fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
- fs);
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm) {
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, get_user_ns(current_user_ns()),
+ &virtio_fs_fiq_ops, fs);
fc->release = fuse_free_conn;
fc->delete_stale = true;
+ fc->auto_submounts = true;
- fsc->s_fs_info = fc;
+ fsc->s_fs_info = fm;
sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
- err = virtio_fs_fill_super(sb);
+ err = virtio_fs_fill_super(sb, fsc);
if (err) {
deactivate_locked_super(sb);
return err;
@@ -1239,11 +1477,19 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
}
static const struct fs_context_operations virtio_fs_context_ops = {
+ .free = virtio_fs_free_fc,
+ .parse_param = virtio_fs_parse_param,
.get_tree = virtio_fs_get_tree,
};
static int virtio_fs_init_fs_context(struct fs_context *fsc)
{
+ struct fuse_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fsc->fs_private = ctx;
fsc->ops = &virtio_fs_context_ops;
return 0;
}
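
The shape of the fs_context change: init_fs_context allocates per-mount
private state, parse_param fills it in, and the .free op releases it when the
context is torn down. A minimal userspace model of that ownership pattern
(all names below are illustrative simplifications, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct mount_ctx { int dax; };                 /* stand-in for fuse_fs_context */

struct fs_context {
        void *fs_private;
        void (*free_private)(void *);
};

static void ctx_free(void *p) { free(p); }

/* Mirrors the flow of virtio_fs_init_fs_context(): allocate, install ops. */
static int init_fs_context(struct fs_context *fsc)
{
        struct mount_ctx *ctx = calloc(1, sizeof(*ctx));

        if (!ctx)
                return -1;                     /* -ENOMEM in the kernel */
        fsc->fs_private = ctx;
        fsc->free_private = ctx_free;
        return 0;
}

int main(void)
{
        struct fs_context fsc = { 0 };

        if (init_fs_context(&fsc))
                return 1;
        ((struct mount_ctx *)fsc.fs_private)->dax = 1;  /* "parse_param" */
        printf("dax=%d\n", ((struct mount_ctx *)fsc.fs_private)->dax);
        fsc.free_private(fsc.fs_private);      /* the .free op on teardown */
        return 0;
}
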
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 20d052e08b3b..371bdcbc7233 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -14,12 +14,12 @@
int fuse_setxattr(struct inode *inode, const char *name, const void *value,
size_t size, int flags)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
int err;
- if (fc->no_setxattr)
+ if (fm->fc->no_setxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -34,9 +34,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
args.in_args[1].value = name;
args.in_args[2].size = size;
args.in_args[2].value = value;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_setxattr = 1;
+ fm->fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
if (!err) {
@@ -49,13 +49,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (fc->no_getxattr)
+ if (fm->fc->no_getxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -77,11 +77,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
if (ret == -ENOSYS) {
- fc->no_getxattr = 1;
+ fm->fc->no_getxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
@@ -107,16 +107,16 @@ static int fuse_verify_xattr_list(char *list, size_t size)
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
{
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (!fuse_allow_current_process(fc))
+ if (!fuse_allow_current_process(fm->fc))
return -EACCES;
- if (fc->no_listxattr)
+ if (fm->fc->no_listxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -136,13 +136,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
if (ret > 0 && size)
ret = fuse_verify_xattr_list(list, ret);
if (ret == -ENOSYS) {
- fc->no_listxattr = 1;
+ fm->fc->no_listxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
@@ -150,11 +150,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
int fuse_removexattr(struct inode *inode, const char *name)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
int err;
- if (fc->no_removexattr)
+ if (fm->fc->no_removexattr)
return -EOPNOTSUPP;
args.opcode = FUSE_REMOVEXATTR;
@@ -162,9 +162,9 @@ int fuse_removexattr(struct inode *inode, const char *name)
args.in_numargs = 1;
args.in_args[0].size = strlen(name) + 1;
args.in_args[0].value = name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_removexattr = 1;
+ fm->fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
if (!err) {
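
The xattr conversion preserves FUSE's usual negotiation idiom: the first
-ENOSYS reply latches a no_* flag on the connection so that later calls fail
fast with -EOPNOTSUPP instead of going back over the wire. A compact
userspace sketch of that latch (the fake server reply is illustrative):

#include <errno.h>
#include <stdio.h>

struct conn { unsigned no_setxattr : 1; };

/* Pretend server: an old implementation that lacks SETXATTR. */
static int send_request(void) { return -ENOSYS; }

static int do_setxattr(struct conn *fc)
{
        int err;

        if (fc->no_setxattr)
                return -EOPNOTSUPP;            /* fast path after the latch */
        err = send_request();
        if (err == -ENOSYS) {
                fc->no_setxattr = 1;           /* remember: never retry */
                err = -EOPNOTSUPP;
        }
        return err;
}

int main(void)
{
        struct conn fc = { 0 };

        printf("%d\n", do_setxattr(&fc));      /* -EOPNOTSUPP, via the wire */
        printf("%d\n", do_setxattr(&fc));      /* -EOPNOTSUPP, cached */
        return 0;
}
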
diff --git a/fs/inode.c b/fs/inode.c
index 72c4c347afb7..9d78c37b00b8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -181,6 +181,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
+ if (sb->s_type->fs_flags & FS_THP_SUPPORT)
+ __set_bit(AS_THP_SUPPORT, &mapping->flags);
mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
diff --git a/fs/internal.h b/fs/internal.h
index 10517ece4516..a7cd0f64faa4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -82,9 +82,6 @@ int may_linkat(struct path *link);
/*
* namespace.c
*/
-extern void *copy_mount_options(const void __user *);
-extern char *copy_mount_string(const void __user *);
-
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, struct path *);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 414beb543883..7cb3b4cb9b11 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,8 @@
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
#include <linux/task_work.h>
+#include <linux/blk-cgroup.h>
+#include <linux/audit.h>
#include "io-wq.h"
@@ -26,9 +28,8 @@ enum {
IO_WORKER_F_UP = 1, /* up and active */
IO_WORKER_F_RUNNING = 2, /* account as running */
IO_WORKER_F_FREE = 4, /* worker on free list */
- IO_WORKER_F_EXITING = 8, /* worker exiting */
- IO_WORKER_F_FIXED = 16, /* static idle worker */
- IO_WORKER_F_BOUND = 32, /* is doing bounded work */
+ IO_WORKER_F_FIXED = 8, /* static idle worker */
+ IO_WORKER_F_BOUND = 16, /* is doing bounded work */
};
enum {
@@ -57,9 +58,13 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *blkcg_css;
+#endif
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
+ struct nsproxy *restore_nsproxy;
struct fs_struct *restore_fs;
};
@@ -87,7 +92,7 @@ enum {
*/
struct io_wqe {
struct {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long hash_map;
unsigned flags;
@@ -148,11 +153,12 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
if (current->files != worker->restore_files) {
__acquire(&wqe->lock);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
task_lock(current);
current->files = worker->restore_files;
+ current->nsproxy = worker->restore_nsproxy;
task_unlock(current);
}
@@ -166,7 +172,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
if (worker->mm) {
if (!dropped_lock) {
__acquire(&wqe->lock);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
}
__set_current_state(TASK_RUNNING);
@@ -175,6 +181,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->mm = NULL;
}
+#ifdef CONFIG_BLK_CGROUP
+ if (worker->blkcg_css) {
+ kthread_associate_blkcg(NULL);
+ worker->blkcg_css = NULL;
+ }
+#endif
+
return dropped_lock;
}
@@ -200,7 +213,6 @@ static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
- unsigned nr_workers;
/*
* If we're not at zero, someone else is holding a brief reference
@@ -220,23 +232,19 @@ static void io_worker_exit(struct io_worker *worker)
worker->flags = 0;
preempt_enable();
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
if (__io_worker_unuse(wqe, worker)) {
__release(&wqe->lock);
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
}
acct->nr_workers--;
- nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
- wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
- spin_unlock_irq(&wqe->lock);
-
- /* all workers gone, wq exit can proceed */
- if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
- complete(&wqe->wq->done);
+ raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
+ if (refcount_dec_and_test(&wqe->wq->refs))
+ complete(&wqe->wq->done);
}
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
@@ -318,6 +326,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
worker->restore_files = current->files;
+ worker->restore_nsproxy = current->nsproxy;
worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker);
}
@@ -421,14 +430,10 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
mmput(worker->mm);
worker->mm = NULL;
}
- if (!work->mm)
- return;
- if (mmget_not_zero(work->mm)) {
- kthread_use_mm(work->mm);
- worker->mm = work->mm;
- /* hang on to this mm */
- work->mm = NULL;
+ if (mmget_not_zero(work->identity->mm)) {
+ kthread_use_mm(work->identity->mm);
+ worker->mm = work->identity->mm;
return;
}
@@ -436,12 +441,25 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
work->flags |= IO_WQ_WORK_CANCEL;
}
+static inline void io_wq_switch_blkcg(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+#ifdef CONFIG_BLK_CGROUP
+ if (!(work->flags & IO_WQ_WORK_BLKCG))
+ return;
+ if (work->identity->blkcg_css != worker->blkcg_css) {
+ kthread_associate_blkcg(work->identity->blkcg_css);
+ worker->blkcg_css = work->identity->blkcg_css;
+ }
+#endif
+}
+
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
- const struct cred *old_creds = override_creds(work->creds);
+ const struct cred *old_creds = override_creds(work->identity->creds);
- worker->cur_creds = work->creds;
+ worker->cur_creds = work->identity->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
@@ -451,18 +469,26 @@ static void io_wq_switch_creds(struct io_worker *worker,
static void io_impersonate_work(struct io_worker *worker,
struct io_wq_work *work)
{
- if (work->files && current->files != work->files) {
+ if ((work->flags & IO_WQ_WORK_FILES) &&
+ current->files != work->identity->files) {
task_lock(current);
- current->files = work->files;
+ current->files = work->identity->files;
+ current->nsproxy = work->identity->nsproxy;
task_unlock(current);
}
- if (work->fs && current->fs != work->fs)
- current->fs = work->fs;
- if (work->mm != worker->mm)
+ if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
+ current->fs = work->identity->fs;
+ if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
io_wq_switch_mm(worker, work);
- if (worker->cur_creds != work->creds)
+ if ((work->flags & IO_WQ_WORK_CREDS) &&
+ worker->cur_creds != work->identity->creds)
io_wq_switch_creds(worker, work);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
+ io_wq_switch_blkcg(worker, work);
+#ifdef CONFIG_AUDIT
+ current->loginuid = work->identity->loginuid;
+ current->sessionid = work->identity->sessionid;
+#endif
}
static void io_assign_current_work(struct io_worker *worker,
@@ -475,6 +501,11 @@ static void io_assign_current_work(struct io_worker *worker,
cond_resched();
}
+#ifdef CONFIG_AUDIT
+ current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
+ current->sessionid = AUDIT_SID_UNSET;
+#endif
+
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
@@ -504,7 +535,7 @@ get_next:
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (!work)
break;
io_assign_current_work(worker, work);
@@ -538,17 +569,17 @@ get_next:
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
} while (work);
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
} while (1);
}
@@ -563,7 +594,7 @@ static int io_wqe_worker(void *data)
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
@@ -574,7 +605,7 @@ loop:
__release(&wqe->lock);
goto loop;
}
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (signal_pending(current))
flush_signals(current);
if (schedule_timeout(WORKER_IDLE_TIMEOUT))
@@ -586,11 +617,11 @@ loop:
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (!wq_list_empty(&wqe->work_list))
io_worker_handle_work(worker);
else
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
io_worker_exit(worker);
@@ -630,14 +661,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING;
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
io_wqe_dec_running(wqe, worker);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
- struct io_wqe_acct *acct =&wqe->acct[index];
+ struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
@@ -655,8 +686,9 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
kfree(worker);
return false;
}
+ kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE;
@@ -665,11 +697,12 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
acct->nr_workers++;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (index == IO_WQ_ACCT_UNBOUND)
atomic_inc(&wq->user->processes);
+ refcount_inc(&wq->refs);
wake_up_process(worker->task);
return true;
}
@@ -685,28 +718,63 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
return acct->nr_workers < acct->max_workers;
}
+static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
+{
+ send_sig(SIGINT, worker->task, 1);
+ return false;
+}
+
+/*
+ * Iterate the passed-in list and call the specified function for each
+ * worker that isn't exiting
+ */
+static bool io_wq_for_each_worker(struct io_wqe *wqe,
+ bool (*func)(struct io_worker *, void *),
+ void *data)
+{
+ struct io_worker *worker;
+ bool ret = false;
+
+ list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
+ if (io_worker_get(worker)) {
+ /* no task if node is/was offline */
+ if (worker->task)
+ ret = func(worker, data);
+ io_worker_release(worker);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static bool io_wq_worker_wake(struct io_worker *worker, void *data)
+{
+ wake_up_process(worker->task);
+ return false;
+}
+
/*
* Manager thread. Tasked with creating new workers, if we need them.
*/
static int io_wq_manager(void *data)
{
struct io_wq *wq = data;
- int workers_to_create = num_possible_nodes();
int node;
/* create fixed workers */
- refcount_set(&wq->refs, workers_to_create);
+ refcount_set(&wq->refs, 1);
for_each_node(node) {
if (!node_online(node))
continue;
- if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
- goto err;
- workers_to_create--;
+ if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
+ continue;
+ set_bit(IO_WQ_BIT_ERROR, &wq->state);
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ goto out;
}
- while (workers_to_create--)
- refcount_dec(&wq->refs);
-
complete(&wq->done);
while (!kthread_should_stop()) {
@@ -720,12 +788,12 @@ static int io_wq_manager(void *data)
if (!node_online(node))
continue;
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
fork_worker[IO_WQ_ACCT_BOUND] = true;
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
fork_worker[IO_WQ_ACCT_UNBOUND] = true;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (fork_worker[IO_WQ_ACCT_BOUND])
create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
if (fork_worker[IO_WQ_ACCT_UNBOUND])
@@ -738,12 +806,18 @@ static int io_wq_manager(void *data)
if (current->task_works)
task_work_run();
- return 0;
-err:
- set_bit(IO_WQ_BIT_ERROR, &wq->state);
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
- if (refcount_sub_and_test(workers_to_create, &wq->refs))
+out:
+ if (refcount_dec_and_test(&wq->refs)) {
complete(&wq->done);
+ return 0;
+ }
+ /* if ERROR is set and we get here, we have workers to wake */
+ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
+ rcu_read_lock();
+ for_each_node(node)
+ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
+ rcu_read_unlock();
+ }
return 0;
}
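
The manager's refcounting scheme changes from "one reference per node taken
up front" to "one base reference, plus one taken by each successfully created
worker", so wq->done completes exactly once when the last holder drops out. A
sequential sketch of the invariant (no threads; the refcount is modeled with
a plain counter):

#include <assert.h>
#include <stdio.h>

static int refs;
static int done_completions;

static void ref_get(void) { refs++; }
static void ref_put(void)
{
        assert(refs > 0);
        if (--refs == 0)
                done_completions++;            /* complete(&wq->done) */
}

int main(void)
{
        int node, created = 0;

        refs = 1;                              /* manager's base reference */
        for (node = 0; node < 4; node++) {
                /* create_io_worker() takes a ref only on success */
                ref_get();
                created++;
        }
        while (created--)
                ref_put();                     /* io_worker_exit() */
        ref_put();                             /* manager drops the base ref */

        assert(done_completions == 1);         /* fired exactly once */
        printf("completions: %d\n", done_completions);
        return 0;
}
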
@@ -821,10 +895,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
}
work_flags = work->flags;
- spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock_irqsave(&wqe->lock, flags);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))
@@ -850,37 +924,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
}
-static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
-{
- send_sig(SIGINT, worker->task, 1);
- return false;
-}
-
-/*
- * Iterate the passed in list and call the specific function for each
- * worker that isn't exiting
- */
-static bool io_wq_for_each_worker(struct io_wqe *wqe,
- bool (*func)(struct io_worker *, void *),
- void *data)
-{
- struct io_worker *worker;
- bool ret = false;
-
- list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
- if (io_worker_get(worker)) {
- /* no task if node is/was offline */
- if (worker->task)
- ret = func(worker, data);
- io_worker_release(worker);
- if (ret)
- break;
- }
- }
-
- return ret;
-}
-
void io_wq_cancel_all(struct io_wq *wq)
{
int node;
@@ -951,13 +994,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
unsigned long flags;
retry:
- spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
io_run_cancel(work, wqe);
match->nr_pending++;
if (!match->cancel_all)
@@ -966,7 +1009,7 @@ retry:
/* not safe to continue after unlock */
goto retry;
}
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
}
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1074,7 +1117,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
}
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
wqe->wq = wq;
- spin_lock_init(&wqe->lock);
+ raw_spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_LIST_HEAD(&wqe->all_list);
@@ -1113,12 +1156,6 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
return refcount_inc_not_zero(&wq->use_refs);
}
-static bool io_wq_worker_wake(struct io_worker *worker, void *data)
-{
- wake_up_process(worker->task);
- return false;
-}
-
static void __io_wq_destroy(struct io_wq *wq)
{
int node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index ddaf9614cf9b..be21c500c925 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -1,6 +1,8 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
+#include <linux/io_uring.h>
+
struct io_wq;
enum {
@@ -10,6 +12,12 @@ enum {
IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16,
+ IO_WQ_WORK_FILES = 32,
+ IO_WQ_WORK_FS = 64,
+ IO_WQ_WORK_MM = 128,
+ IO_WQ_WORK_CREDS = 256,
+ IO_WQ_WORK_BLKCG = 512,
+
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -85,11 +93,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
- struct files_struct *files;
- struct mm_struct *mm;
- const struct cred *creds;
- struct fs_struct *fs;
- unsigned long fsize;
+ struct io_identity *identity;
unsigned flags;
};
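
This struct change is the heart of the series: instead of five per-field
pointers on every work item, a single io_identity pointer plus flag bits
recording which pieces of that identity the work actually needs. A userspace
sketch of the flag-gated application (field types are simplified to strings):

#include <stdio.h>

enum { WORK_MM = 1, WORK_CREDS = 2, WORK_FS = 4 };

struct identity { const char *mm, *creds, *fs; };
struct work     { struct identity *id; unsigned flags; };

/* Mirrors io_impersonate_work(): apply only the fields the flags name. */
static void impersonate(const struct work *w)
{
        if (w->flags & WORK_MM)
                printf("use mm    %s\n", w->id->mm);
        if (w->flags & WORK_CREDS)
                printf("use creds %s\n", w->id->creds);
        if (w->flags & WORK_FS)
                printf("use fs    %s\n", w->id->fs);
}

int main(void)
{
        struct identity id = { "task-mm", "task-creds", "task-fs" };
        struct work w = { &id, WORK_MM | WORK_CREDS }; /* no FS needed */

        impersonate(&w);
        return 0;
}
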
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3790c7fe9fee..02dc81622081 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,9 @@
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
+#include <linux/io_uring.h>
+#include <linux/blk-cgroup.h>
+#include <linux/audit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -98,6 +101,8 @@
#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
+#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
+ IORING_REGISTER_LAST + IORING_OP_LAST)
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
@@ -187,6 +192,7 @@ struct io_mapped_ubuf {
size_t len;
struct bio_vec *bvec;
unsigned int nr_bvecs;
+ unsigned long acct_pages;
};
struct fixed_file_table {
@@ -205,7 +211,7 @@ struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
- struct percpu_ref *cur_refs;
+ struct fixed_file_ref_node *node;
struct percpu_ref refs;
struct completion done;
struct list_head ref_list;
@@ -219,6 +225,27 @@ struct io_buffer {
__u16 bid;
};
+struct io_restriction {
+ DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+ DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+ u8 sqe_flags_allowed;
+ u8 sqe_flags_required;
+ bool registered;
+};
+
+struct io_sq_data {
+ refcount_t refs;
+ struct mutex lock;
+
+ /* ctx's that are using this sqd */
+ struct list_head ctx_list;
+ struct list_head ctx_new_list;
+ struct mutex ctx_lock;
+
+ struct task_struct *thread;
+ struct wait_queue_head wait;
+};
+
struct io_ring_ctx {
struct {
struct percpu_ref refs;
@@ -231,6 +258,7 @@ struct io_ring_ctx {
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
+ unsigned int restricted: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -264,9 +292,25 @@ struct io_ring_ctx {
/* IO offload */
struct io_wq *io_wq;
- struct task_struct *sqo_thread; /* if using sq thread polling */
- struct mm_struct *sqo_mm;
- wait_queue_head_t sqo_wait;
+
+ /*
+ * For SQPOLL usage - we hold a reference to the parent task, so we
+ * have access to the ->files
+ */
+ struct task_struct *sqo_task;
+
+ /* Only used for accounting purposes */
+ struct mm_struct *mm_account;
+
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *sqo_blkcg_css;
+#endif
+
+ struct io_sq_data *sq_data; /* if using sq thread polling */
+
+ struct wait_queue_head sqo_sq_wait;
+ struct wait_queue_entry sqo_wait_entry;
+ struct list_head sqd_list;
/*
* If used, fixed file set. Writers must ensure that ->refs is dead,
@@ -275,8 +319,6 @@ struct io_ring_ctx {
*/
struct fixed_file_data *file_data;
unsigned nr_user_files;
- int ring_fd;
- struct file *ring_file;
/* if used, fixed mapped user buffers */
unsigned nr_user_bufs;
@@ -286,6 +328,11 @@ struct io_ring_ctx {
const struct cred *creds;
+#ifdef CONFIG_AUDIT
+ kuid_t loginuid;
+ unsigned int sessionid;
+#endif
+
struct completion ref_comp;
struct completion sq_thread_comp;
@@ -338,6 +385,7 @@ struct io_ring_ctx {
struct llist_head file_put_llist;
struct work_struct exit_work;
+ struct io_restriction restrictions;
};
/*
@@ -392,13 +440,16 @@ struct io_cancel {
struct io_timeout {
struct file *file;
- u64 addr;
- int flags;
u32 off;
u32 target_seq;
struct list_head list;
};
+struct io_timeout_rem {
+ struct file *file;
+ u64 addr;
+};
+
struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
@@ -514,15 +565,6 @@ struct io_async_rw {
struct wait_page_queue wpq;
};
-struct io_async_ctx {
- union {
- struct io_async_rw rw;
- struct io_async_msghdr msg;
- struct io_async_connect connect;
- struct io_timeout_data timeout;
- };
-};
-
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
@@ -538,13 +580,11 @@ enum {
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
- REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_WORK_INITIALIZED_BIT,
- REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -578,8 +618,6 @@ enum {
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* completion under lock */
- REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
@@ -590,8 +628,6 @@ enum {
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
- /* req->task is refcounted */
- REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
@@ -614,6 +650,7 @@ struct io_kiocb {
struct io_sync sync;
struct io_cancel cancel;
struct io_timeout timeout;
+ struct io_timeout_rem timeout_rem;
struct io_connect connect;
struct io_sr_msg sr_msg;
struct io_open open;
@@ -629,7 +666,8 @@ struct io_kiocb {
struct io_completion compl;
};
- struct io_async_ctx *io;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
@@ -697,10 +735,6 @@ struct io_submit_state {
};
struct io_op_def {
- /* needs req->io allocated for deferral/async */
- unsigned async_ctx : 1;
- /* needs current->mm setup, does mm access */
- unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
/* don't fail if file grab fails */
@@ -711,44 +745,51 @@ struct io_op_def {
unsigned unbound_nonreg_file : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
- /* needs file table */
- unsigned file_table : 1;
- /* needs ->fs */
- unsigned needs_fs : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
+ /* needs rlimit(RLIMIT_FSIZE) assigned */
unsigned needs_fsize : 1;
+ /* must always have async data allocated */
+ unsigned needs_async_data : 1;
+ /* size of async data needed, if any */
+ unsigned short async_size;
+ unsigned work_flags;
};
static const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {},
[IORING_OP_READV] = {
- .async_ctx = 1,
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITEV] = {
- .async_ctx = 1,
- .needs_mm = 1,
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
@@ -756,6 +797,8 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -764,115 +807,123 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_SENDMSG] = {
- .async_ctx = 1,
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
- .needs_fs = 1,
.pollout = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_async_msghdr),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_RECVMSG] = {
- .async_ctx = 1,
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
- .needs_fs = 1,
.pollin = 1,
.buffer_select = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_async_msghdr),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_TIMEOUT] = {
- .async_ctx = 1,
- .needs_mm = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_timeout_data),
+ .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_TIMEOUT_REMOVE] = {},
[IORING_OP_ACCEPT] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
- .file_table = 1,
.pollin = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
- .async_ctx = 1,
- .needs_mm = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_timeout_data),
+ .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_CONNECT] = {
- .async_ctx = 1,
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_async_connect),
+ .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
.needs_fsize = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_OPENAT] = {
- .file_table = 1,
- .needs_fs = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.needs_file_no_error = 1,
- .file_table = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
},
[IORING_OP_FILES_UPDATE] = {
- .needs_mm = 1,
- .file_table = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
},
[IORING_OP_STATX] = {
- .needs_mm = 1,
- .needs_fs = 1,
- .file_table = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITE] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_MADVISE] = {
- .needs_mm = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_SEND] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECV] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_OPENAT2] = {
- .file_table = 1,
- .needs_fs = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
+ IO_WQ_WORK_BLKCG,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
- .file_table = 1,
+ .work_flags = IO_WQ_WORK_FILES,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
@@ -892,21 +943,18 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
+static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
-static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
-static int io_prep_work_files(struct io_kiocb *req);
static void __io_clean_op(struct io_kiocb *req);
-static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
- int fd, struct file **out_file, bool fixed);
-static void __io_queue_sqe(struct io_kiocb *req,
- const struct io_uring_sqe *sqe,
- struct io_comp_state *cs);
+static struct file *io_file_get(struct io_submit_state *state,
+ struct io_kiocb *req, int fd, bool fixed);
+static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
static void io_file_put_work(struct work_struct *work);
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
@@ -933,14 +981,6 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
-static void io_get_req_task(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_TASK_PINNED)
- return;
- get_task_struct(req->task);
- req->flags |= REQ_F_TASK_PINNED;
-}
-
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
@@ -948,13 +988,6 @@ static inline void io_clean_op(struct io_kiocb *req)
__io_clean_op(req);
}
-/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
-static void __io_put_req_task(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_TASK_PINNED)
- put_task_struct(req->task);
-}
-
static void io_sq_thread_drop_mm(void)
{
struct mm_struct *mm = current->mm;
@@ -969,9 +1002,10 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
if (!current->mm) {
if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
- !mmget_not_zero(ctx->sqo_mm)))
+ !ctx->sqo_task->mm ||
+ !mmget_not_zero(ctx->sqo_task->mm)))
return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
+ kthread_use_mm(ctx->sqo_task->mm);
}
return 0;
@@ -980,11 +1014,31 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].needs_mm)
+ if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
return 0;
return __io_sq_thread_acquire_mm(ctx);
}
+static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
+ struct cgroup_subsys_state **cur_css)
+
+{
+#ifdef CONFIG_BLK_CGROUP
+ /* puts the old one when swapping */
+ if (*cur_css != ctx->sqo_blkcg_css) {
+ kthread_associate_blkcg(ctx->sqo_blkcg_css);
+ *cur_css = ctx->sqo_blkcg_css;
+ }
+#endif
+}
+
+static void io_sq_thread_unassociate_blkcg(void)
+{
+#ifdef CONFIG_BLK_CGROUP
+ kthread_associate_blkcg(NULL);
+#endif
+}
+
static inline void req_set_fail_links(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
@@ -992,16 +1046,48 @@ static inline void req_set_fail_links(struct io_kiocb *req)
}
/*
+ * None of these are dereferenced, they are simply used to check if any of
+ * them have changed. If we're under current and check they are still the
+ * same, we're fine to grab references to them for actual out-of-line use.
+ */
+static void io_init_identity(struct io_identity *id)
+{
+ id->files = current->files;
+ id->mm = current->mm;
+#ifdef CONFIG_BLK_CGROUP
+ rcu_read_lock();
+ id->blkcg_css = blkcg_css();
+ rcu_read_unlock();
+#endif
+ id->creds = current_cred();
+ id->nsproxy = current->nsproxy;
+ id->fs = current->fs;
+ id->fsize = rlimit(RLIMIT_FSIZE);
+#ifdef CONFIG_AUDIT
+ id->loginuid = current->loginuid;
+ id->sessionid = current->sessionid;
+#endif
+ refcount_set(&id->count, 1);
+}
+
+/*
* Note: must call io_req_init_async() for the first time you
* touch any members of io_wq_work.
*/
static inline void io_req_init_async(struct io_kiocb *req)
{
+ struct io_uring_task *tctx = current->io_uring;
+
if (req->flags & REQ_F_WORK_INITIALIZED)
return;
memset(&req->work, 0, sizeof(req->work));
req->flags |= REQ_F_WORK_INITIALIZED;
+
+ /* Grab a ref if this isn't our static identity */
+ req->work.identity = tctx->identity;
+ if (tctx->identity != &tctx->__identity)
+ refcount_inc(&req->work.identity->count);
}
static inline bool io_async_submit(struct io_ring_ctx *ctx)
@@ -1054,7 +1140,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
- init_waitqueue_head(&ctx->sqo_wait);
+ init_waitqueue_head(&ctx->sqo_sq_wait);
+ INIT_LIST_HEAD(&ctx->sqd_list);
init_waitqueue_head(&ctx->cq_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
@@ -1106,76 +1193,195 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-/*
- * Returns true if we need to defer file table putting. This can only happen
- * from the error path with REQ_F_COMP_LOCKED set.
- */
-static bool io_req_clean_work(struct io_kiocb *req)
+static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
+{
+ if (req->work.identity == &tctx->__identity)
+ return;
+ if (refcount_dec_and_test(&req->work.identity->count))
+ kfree(req->work.identity);
+}
+
+static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
- return false;
+ return;
req->flags &= ~REQ_F_WORK_INITIALIZED;
- if (req->work.mm) {
- mmdrop(req->work.mm);
- req->work.mm = NULL;
+ if (req->work.flags & IO_WQ_WORK_MM) {
+ mmdrop(req->work.identity->mm);
+ req->work.flags &= ~IO_WQ_WORK_MM;
}
- if (req->work.creds) {
- put_cred(req->work.creds);
- req->work.creds = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ if (req->work.flags & IO_WQ_WORK_BLKCG) {
+ css_put(req->work.identity->blkcg_css);
+ req->work.flags &= ~IO_WQ_WORK_BLKCG;
}
- if (req->work.fs) {
- struct fs_struct *fs = req->work.fs;
-
- if (req->flags & REQ_F_COMP_LOCKED)
- return true;
+#endif
+ if (req->work.flags & IO_WQ_WORK_CREDS) {
+ put_cred(req->work.identity->creds);
+ req->work.flags &= ~IO_WQ_WORK_CREDS;
+ }
+ if (req->work.flags & IO_WQ_WORK_FS) {
+ struct fs_struct *fs = req->work.identity->fs;
- spin_lock(&req->work.fs->lock);
+ spin_lock(&req->work.identity->fs->lock);
if (--fs->users)
fs = NULL;
- spin_unlock(&req->work.fs->lock);
+ spin_unlock(&req->work.identity->fs->lock);
if (fs)
free_fs_struct(fs);
- req->work.fs = NULL;
+ req->work.flags &= ~IO_WQ_WORK_FS;
}
- return false;
+ io_put_identity(req->task->io_uring, req);
+}
+
+/*
+ * Create a private copy of io_identity, since some fields don't match
+ * the current context.
+ */
+static bool io_identity_cow(struct io_kiocb *req)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ const struct cred *creds = NULL;
+ struct io_identity *id;
+
+ if (req->work.flags & IO_WQ_WORK_CREDS)
+ creds = req->work.identity->creds;
+
+ id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
+ if (unlikely(!id)) {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ return false;
+ }
+
+ /*
+ * We can safely just re-init the creds we copied. Either the field
+ * matches the current one, or we haven't grabbed it yet. The only
+ * exception is ->creds, through registered personalities, so handle
+ * that one separately.
+ */
+ io_init_identity(id);
+ if (creds)
+ req->work.identity->creds = creds;
+
+ /* add one for this request */
+ refcount_inc(&id->count);
+
+ /* drop old identity, assign new one. one ref for req, one for tctx */
+ if (req->work.identity != tctx->identity &&
+ refcount_sub_and_test(2, &req->work.identity->count))
+ kfree(req->work.identity);
+
+ req->work.identity = id;
+ tctx->identity = id;
+ return true;
+}
+
+static bool io_grab_identity(struct io_kiocb *req)
+{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ struct io_identity *id = req->work.identity;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (def->needs_fsize && id->fsize != rlimit(RLIMIT_FSIZE))
+ return false;
+
+ if (!(req->work.flags & IO_WQ_WORK_FILES) &&
+ (def->work_flags & IO_WQ_WORK_FILES) &&
+ !(req->flags & REQ_F_NO_FILE_TABLE)) {
+ if (id->files != current->files ||
+ id->nsproxy != current->nsproxy)
+ return false;
+ atomic_inc(&id->files->count);
+ get_nsproxy(id->nsproxy);
+ req->flags |= REQ_F_INFLIGHT;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ spin_unlock_irq(&ctx->inflight_lock);
+ req->work.flags |= IO_WQ_WORK_FILES;
+ }
+#ifdef CONFIG_BLK_CGROUP
+ if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
+ (def->work_flags & IO_WQ_WORK_BLKCG)) {
+ rcu_read_lock();
+ if (id->blkcg_css != blkcg_css()) {
+ rcu_read_unlock();
+ return false;
+ }
+ /*
+ * This should be rare, either the cgroup is dying or the task
+ * is moving cgroups. Just punt to root for the handful of ios.
+ */
+ if (css_tryget_online(id->blkcg_css))
+ req->work.flags |= IO_WQ_WORK_BLKCG;
+ rcu_read_unlock();
+ }
+#endif
+ if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
+ if (id->creds != current_cred())
+ return false;
+ get_cred(id->creds);
+ req->work.flags |= IO_WQ_WORK_CREDS;
+ }
+#ifdef CONFIG_AUDIT
+ if (!uid_eq(current->loginuid, id->loginuid) ||
+ current->sessionid != id->sessionid)
+ return false;
+#endif
+ if (!(req->work.flags & IO_WQ_WORK_FS) &&
+ (def->work_flags & IO_WQ_WORK_FS)) {
+ if (current->fs != id->fs)
+ return false;
+ spin_lock(&id->fs->lock);
+ if (!id->fs->in_exec) {
+ id->fs->users++;
+ req->work.flags |= IO_WQ_WORK_FS;
+ } else {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ }
+ spin_unlock(&id->fs->lock);
+ }
+
+ return true;
}
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_identity *id;
io_req_init_async(req);
+ id = req->work.identity;
if (req->flags & REQ_F_ISREG) {
- if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
- if (!req->work.mm && def->needs_mm) {
- mmgrab(current->mm);
- req->work.mm = current->mm;
- }
- if (!req->work.creds)
- req->work.creds = get_current_cred();
- if (!req->work.fs && def->needs_fs) {
- spin_lock(&current->fs->lock);
- if (!current->fs->in_exec) {
- req->work.fs = current->fs;
- req->work.fs->users++;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
+
+ /* ->mm can never change on us */
+ if (!(req->work.flags & IO_WQ_WORK_MM) &&
+ (def->work_flags & IO_WQ_WORK_MM)) {
+ mmgrab(id->mm);
+ req->work.flags |= IO_WQ_WORK_MM;
}
- if (def->needs_fsize)
- req->work.fsize = rlimit(RLIMIT_FSIZE);
- else
- req->work.fsize = RLIM_INFINITY;
+
+ /* if we fail grabbing identity, we must COW, regrab, and retry */
+ if (io_grab_identity(req))
+ return;
+
+ if (!io_identity_cow(req))
+ return;
+
+ /* can't fail at this point */
+ if (!io_grab_identity(req))
+ WARN_ON(1);
}
static void io_prep_async_link(struct io_kiocb *req)
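
io_identity_cow() above is a classic copy-on-write: the per-task identity is
shared and refcounted, and a request whose execution environment has drifted
duplicates it instead of mutating the shared copy. A minimal sketch of that
refcount dance (a single uid field stands in for the full identity, and one
reference is dropped here rather than the kernel's two):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct identity { int uid; int refs; };

static void identity_put(struct identity *id)
{
        if (--id->refs == 0)
                free(id);
}

/* Mirrors io_identity_cow(): duplicate, then retarget the request. */
static struct identity *identity_cow(struct identity **slot)
{
        struct identity *id = malloc(sizeof(*id));

        if (!id)
                return NULL;
        memcpy(id, *slot, sizeof(*id));
        id->refs = 1;                  /* the new private reference */
        identity_put(*slot);           /* drop our ref on the shared copy */
        *slot = id;
        return id;
}

int main(void)
{
        struct identity *task_id = malloc(sizeof(*task_id));
        struct identity *req_id;

        task_id->uid = 1000;
        task_id->refs = 2;             /* the task slot + this request */
        req_id = task_id;              /* request starts on the shared copy */

        identity_cow(&req_id);         /* environment drifted: go private */
        req_id->uid = 0;               /* safe: does not touch task_id */

        printf("task uid=%d, req uid=%d\n", task_id->uid, req_id->uid);
        identity_put(task_id);
        identity_put(req_id);
        return 0;
}
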
@@ -1213,27 +1419,49 @@ static void io_queue_async_work(struct io_kiocb *req)
static void io_kill_timeout(struct io_kiocb *req)
{
+ struct io_timeout_data *io = req->async_data;
int ret;
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
+ ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
}
}
-static void io_kill_timeouts(struct io_ring_ctx *ctx)
+static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!tsk || req->task == tsk)
+ return true;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (ctx->sq_data && req->task == ctx->sq_data->thread)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Returns true if we found and killed one or more timeouts
+ */
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct io_kiocb *req, *tmp;
+ int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
- io_kill_timeout(req);
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
+ if (io_task_match(req, tsk)) {
+ io_kill_timeout(req);
+ canceled++;
+ }
+ }
spin_unlock_irq(&ctx->completion_lock);
+ return canceled != 0;
}
static void __io_queue_deferred(struct io_ring_ctx *ctx)
@@ -1251,8 +1479,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
if (link) {
__io_queue_linked_timeout(link);
/* drop submission reference */
- link->flags |= REQ_F_COMP_LOCKED;
- io_put_req(link);
+ io_put_req_deferred(link, 1);
}
kfree(de);
} while (!list_empty(&ctx->defer_list));
@@ -1284,6 +1511,13 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_queue_deferred(ctx);
}
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+{
+ struct io_rings *r = ctx->rings;
+
+ return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
+}
+
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
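
io_sqring_full() relies on the producer and consumer indices being
free-running u32 counters: tail - head yields the number of queued entries
even after either counter wraps. A small demonstration of that
unsigned-wraparound property (the ring size is arbitrary):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint32_t entries = 128;          /* sq_ring_entries */
        uint32_t head = UINT32_MAX - 5;        /* consumer, about to wrap */
        uint32_t tail = head;                  /* producer */
        uint32_t i;

        for (i = 0; i < entries; i++)
                tail++;                        /* tail wraps past 0 here */

        /* Distance is still correct despite the wrap... */
        assert(tail - head == entries);
        /* ...which is exactly the "ring is full" test. */
        printf("full: %s\n", tail - head == entries ? "yes" : "no");
        return 0;
}
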
@@ -1317,8 +1551,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- if (waitqueue_active(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
+ if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
@@ -1332,12 +1566,25 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
+static inline bool io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ if (!files)
+ return true;
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES))
+ return req->work.identity->files == files;
+ return false;
+}
+
/* Returns true if there are no backlogged entries after the flush */
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+ struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_rings *rings = ctx->rings;
+ struct io_kiocb *req, *tmp;
struct io_uring_cqe *cqe;
- struct io_kiocb *req;
unsigned long flags;
LIST_HEAD(list);
@@ -1356,13 +1603,16 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
ctx->cq_overflow_flushed = 1;
cqe = NULL;
- while (!list_empty(&ctx->cq_overflow_list)) {
+ list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
+ if (tsk && req->task != tsk)
+ continue;
+ if (!io_match_files(req, files))
+ continue;
+
cqe = io_get_cqring(ctx);
if (!cqe && !force)
break;
- req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
- compl.list);
list_move(&req->compl.list, &list);
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
@@ -1406,7 +1656,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed) {
+ } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
+ /*
+ * If we're in ring overflow flush mode, or in task cancel mode,
+ * then we cannot store the request for later flushing; we need
+ * to drop it on the floor.
+ */
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
} else {
@@ -1452,13 +1707,19 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
list_del(&req->compl.list);
__io_cqring_fill_event(req, req->result, req->compl.cflags);
- if (!(req->flags & REQ_F_LINK_HEAD)) {
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req(req);
- } else {
+
+ /*
+ * io_free_req() doesn't care about completion_lock unless one
+ * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
+ * because of a potential deadlock with req->work.fs->lock
+ */
+ if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
+ |REQ_F_WORK_INITIALIZED)) {
spin_unlock_irq(&ctx->completion_lock);
io_put_req(req);
spin_lock_irq(&ctx->completion_lock);
+ } else {
+ io_put_req(req);
}
}
io_commit_cqring(ctx);
@@ -1509,10 +1770,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
- struct io_kiocb *req;
-
if (!state->free_reqs) {
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
size_t sz;
int ret;
@@ -1529,14 +1788,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
goto fallback;
ret = 1;
}
- state->free_reqs = ret - 1;
- req = state->reqs[ret - 1];
- } else {
- state->free_reqs--;
- req = state->reqs[state->free_reqs];
+ state->free_reqs = ret;
}
- return req;
+ state->free_reqs--;
+ return state->reqs[state->free_reqs];
fallback:
return io_get_fallback_req(ctx);
}
@@ -1550,23 +1806,30 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static bool io_dismantle_req(struct io_kiocb *req)
+static void io_dismantle_req(struct io_kiocb *req)
{
io_clean_op(req);
- if (req->io)
- kfree(req->io);
+ if (req->async_data)
+ kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- return io_req_clean_work(req);
+ io_req_clean_work(req);
}
-static void __io_free_req_finish(struct io_kiocb *req)
+static void __io_free_req(struct io_kiocb *req)
{
+ struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
- __io_put_req_task(req);
+ io_dismantle_req(req);
+
+ percpu_counter_dec(&tctx->inflight);
+ if (tctx->in_idle)
+ wake_up(&tctx->wait);
+ put_task_struct(req->task);
+
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
else
@@ -1574,50 +1837,18 @@ static void __io_free_req_finish(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
-static void io_req_task_file_table_put(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct fs_struct *fs = req->work.fs;
-
- spin_lock(&req->work.fs->lock);
- if (--fs->users)
- fs = NULL;
- spin_unlock(&req->work.fs->lock);
- if (fs)
- free_fs_struct(fs);
- req->work.fs = NULL;
- __io_free_req_finish(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
- if (!io_dismantle_req(req)) {
- __io_free_req_finish(req);
- } else {
- int ret;
-
- init_task_work(&req->task_work, io_req_task_file_table_put);
- ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
- if (unlikely(ret)) {
- struct task_struct *tsk;
-
- tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
- }
- }
-}
-
static bool io_link_cancel_timeout(struct io_kiocb *req)
{
+ struct io_timeout_data *io = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
+ ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
req->flags &= ~REQ_F_LINK_HEAD;
- io_put_req(req);
+ io_put_req_deferred(req, 1);
return true;
}
@@ -1636,7 +1867,6 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
return false;
list_del_init(&link->link_list);
- link->flags |= REQ_F_COMP_LOCKED;
wake_ev = io_link_cancel_timeout(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
return wake_ev;
@@ -1645,17 +1875,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
bool wake_ev;
- if (!(req->flags & REQ_F_COMP_LOCKED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- wake_ev = __io_kill_linked_timeout(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else {
- wake_ev = __io_kill_linked_timeout(req);
- }
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ wake_ev = __io_kill_linked_timeout(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (wake_ev)
io_cqring_ev_posted(ctx);
@@ -1695,28 +1920,29 @@ static void __io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
- link->flags |= REQ_F_COMP_LOCKED;
- __io_double_put_req(link);
- req->flags &= ~REQ_F_LINK_TIMEOUT;
+
+ /*
+ * It's ok to free under spinlock as they're not linked anymore,
+ * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
+ * work.fs->lock.
+ */
+ if (link->flags & REQ_F_WORK_INITIALIZED)
+ io_put_req_deferred(link, 2);
+ else
+ io_double_put_req(link);
}
io_commit_cqring(ctx);
- io_cqring_ev_posted(ctx);
}
static void io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
- if (!(req->flags & REQ_F_COMP_LOCKED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- __io_fail_links(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else {
- __io_fail_links(req);
- }
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ __io_fail_links(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
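
The comment in __io_fail_links() above captures the rule this hunk enforces: a request whose work state is initialized may take work.fs->lock during teardown, so it must not be freed while completion_lock is held; io_put_req_deferred() bounces the final put out of the locked region instead. A loose userspace analogue of the same discipline, unlinking under one lock and tearing down outside it (list and lock names are invented; a sketch of the pattern, not the kernel logic):

#include <pthread.h>
#include <stdlib.h>

struct item { struct item *next; /* teardown takes its own lock */ };

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;

/* Fail a chain of items: unlink under the lock, tear down after. */
static void fail_links(struct item **head)
{
	struct item *deferred = NULL, *it;

	pthread_mutex_lock(&completion_lock);
	while ((it = *head)) {
		*head = it->next;
		it->next = deferred;	/* defer, don't free here */
		deferred = it;
	}
	pthread_mutex_unlock(&completion_lock);

	/* teardown runs without completion_lock held */
	while ((it = deferred)) {
		deferred = it->next;
		free(it);
	}
}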
@@ -1746,13 +1972,15 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
return __io_req_find_next(req);
}
-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
- bool twa_signal_ok)
+static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
int ret, notify;
+ if (tsk->flags & PF_EXITING)
+ return -ESRCH;
+
/*
* SQPOLL kernel thread doesn't need notification, just a wakeup. For
* all other cases, use TWA_SIGNAL unconditionally to ensure we're
@@ -1763,7 +1991,7 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
notify = TWA_SIGNAL;
- ret = task_work_add(tsk, cb, notify);
+ ret = task_work_add(tsk, &req->task_work, notify);
if (!ret)
wake_up_process(tsk);
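
The comment above this hunk explains the notification choice: an SQPOLL kernel thread just loops and only needs wake_up_process(), while a normal task needs TWA_SIGNAL so queued task_work also runs when it is blocked in the kernel. Condensed from the lines above for readability (kernel context assumed; not a standalone snippet):

	int notify = 0;		/* SQPOLL thread: a plain wakeup suffices */

	if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
		notify = TWA_SIGNAL;	/* interrupt in-kernel waits too */

	ret = task_work_add(tsk, &req->task_work, notify);
	if (!ret)
		wake_up_process(tsk);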
@@ -1787,8 +2015,10 @@ static void __io_req_task_cancel(struct io_kiocb *req, int error)
static void io_req_task_cancel(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
__io_req_task_cancel(req, -ECANCELED);
+ percpu_ref_put(&ctx->refs);
}
static void __io_req_task_submit(struct io_kiocb *req)
@@ -1797,7 +2027,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
if (!__io_sq_thread_acquire_mm(ctx)) {
mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(req, NULL, NULL);
+ __io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
} else {
__io_req_task_cancel(req, -EFAULT);
@@ -1820,7 +2050,7 @@ static void io_req_task_queue(struct io_kiocb *req)
init_task_work(&req->task_work, io_req_task_submit);
percpu_ref_get(&req->ctx->refs);
- ret = io_req_task_work_add(req, &req->task_work, true);
+ ret = io_req_task_work_add(req, true);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -1874,6 +2104,9 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
if (rb->to_free)
__io_req_free_batch_flush(ctx, rb);
if (rb->task) {
+ struct io_uring_task *tctx = rb->task->io_uring;
+
+ percpu_counter_sub(&tctx->inflight, rb->task_refs);
put_task_struct_many(rb->task, rb->task_refs);
rb->task = NULL;
}
@@ -1888,18 +2121,19 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
if (req->flags & REQ_F_LINK_HEAD)
io_queue_next(req);
- if (req->flags & REQ_F_TASK_PINNED) {
- if (req->task != rb->task) {
- if (rb->task)
- put_task_struct_many(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
+ if (req->task != rb->task) {
+ if (rb->task) {
+ struct io_uring_task *tctx = rb->task->io_uring;
+
+ percpu_counter_sub(&tctx->inflight, rb->task_refs);
+ put_task_struct_many(rb->task, rb->task_refs);
}
- rb->task_refs++;
- req->flags &= ~REQ_F_TASK_PINNED;
+ rb->task = req->task;
+ rb->task_refs = 0;
}
+ rb->task_refs++;
- WARN_ON_ONCE(io_dismantle_req(req));
+ io_dismantle_req(req);
rb->reqs[rb->to_free++] = req;
if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
__io_req_free_batch_flush(req->ctx, rb);
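
io_req_free_batch() now amortizes the per-task accounting: consecutive requests owned by the same task only bump a local task_refs, and one percpu_counter_sub()/put_task_struct_many() pair is issued when the owner changes or the batch finishes. A rough userspace model of run-length batching against an atomic counter (all names invented; a sketch of the pattern, not the kernel code):

#include <stdatomic.h>

struct batch { void *owner; int refs; };

static void flush(struct batch *b, _Atomic long *counter)
{
	if (b->owner)
		atomic_fetch_sub(counter, b->refs);	/* one RMW per run */
	b->refs = 0;
}

static void account(struct batch *b, void *owner, _Atomic long *counter)
{
	if (owner != b->owner) {	/* run ended: flush and switch */
		flush(b, counter);
		b->owner = owner;
	}
	b->refs++;			/* cheap local increment */
}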
@@ -1926,6 +2160,34 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
+static void io_put_req_deferred_cb(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+ io_free_req(req);
+}
+
+static void io_free_req_deferred(struct io_kiocb *req)
+{
+ int ret;
+
+ init_task_work(&req->task_work, io_put_req_deferred_cb);
+ ret = io_req_task_work_add(req, true);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
+ }
+}
+
+static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
+{
+ if (refcount_sub_and_test(refs, &req->refs))
+ io_free_req_deferred(req);
+}
+
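Callers that already hold completion_lock (the timeout and poll cancellation paths in this patch) use the helper in place of a direct io_put_req(): the reference drop is immediate, but a final put only schedules the free via task_work. A typical call site, mirroring how the patch uses it:

	spin_lock_irq(&ctx->completion_lock);
	io_cqring_fill_event(req, -ECANCELED);
	io_commit_cqring(ctx);
	/* safe under the lock: a last put only queues the free */
	io_put_req_deferred(req, 1);
	spin_unlock_irq(&ctx->completion_lock);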
static struct io_wq_work *io_steal_work(struct io_kiocb *req)
{
struct io_kiocb *nxt;
@@ -1942,17 +2204,6 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
return nxt ? &nxt->work : NULL;
}
-/*
- * Must only be used if we don't need to care about links, usually from
- * within the completion handling itself.
- */
-static void __io_double_put_req(struct io_kiocb *req)
-{
- /* drop both submit and complete references */
- if (refcount_sub_and_test(2, &req->refs))
- __io_free_req(req);
-}
-
static void io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
@@ -1973,7 +2224,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
if (noflush && !list_empty(&ctx->cq_overflow_list))
return -1U;
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx, false, NULL, NULL);
}
/* See comment at the top of this file */
@@ -2010,6 +2261,12 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
static inline bool io_run_task_work(void)
{
+ /*
+ * Not safe to run on exiting task, and the task_work handling will
+ * not add work to such a task.
+ */
+ if (unlikely(current->flags & PF_EXITING))
+ return false;
if (current->task_works) {
__set_current_state(TASK_RUNNING);
task_work_run();
@@ -2283,13 +2540,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
goto end_req;
}
- ret = io_import_iovec(rw, req, &iovec, &iter, false);
- if (ret < 0)
- goto end_req;
- ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
- if (!ret)
+ if (!req->async_data) {
+ ret = io_import_iovec(rw, req, &iovec, &iter, false);
+ if (ret < 0)
+ goto end_req;
+ ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
+ if (!ret)
+ return true;
+ kfree(iovec);
+ } else {
return true;
- kfree(iovec);
+ }
end_req:
req_set_fail_links(req);
io_req_complete(req, ret);
@@ -2386,8 +2647,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
if ((ctx->flags & IORING_SETUP_SQPOLL) &&
- wq_has_sleeper(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
+ wq_has_sleeper(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
}
static void __io_state_file_put(struct io_submit_state *state)
@@ -2416,7 +2677,6 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (state->file) {
if (state->fd == fd) {
state->has_refs--;
- state->ios_left--;
return state->file;
}
__io_state_file_put(state);
@@ -2426,15 +2686,14 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
return NULL;
state->fd = fd;
- state->ios_left--;
- state->has_refs = state->ios_left;
+ state->has_refs = state->ios_left - 1;
return state->file;
}
static bool io_bdev_nowait(struct block_device *bdev)
{
#ifdef CONFIG_BLOCK
- return !bdev || queue_is_mq(bdev_get_queue(bdev));
+ return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
#else
return true;
#endif
@@ -2476,8 +2735,7 @@ static bool io_file_supports_async(struct file *file, int rw)
return file->f_op->write_iter != NULL;
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2512,12 +2770,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (kiocb->ki_flags & IOCB_NOWAIT)
req->flags |= REQ_F_NOWAIT;
- if (kiocb->ki_flags & IOCB_DIRECT)
- io_get_req_task(req);
-
- if (force_nonblock)
- kiocb->ki_flags |= IOCB_NOWAIT;
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) ||
!kiocb->ki_filp->f_op->iopoll)
@@ -2526,7 +2778,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
- io_get_req_task(req);
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -2564,13 +2815,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_comp_state *cs)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+ struct io_async_rw *io = req->async_data;
/* add previously done IO, if any */
- if (req->io && req->io->rw.bytes_done > 0) {
+ if (io && io->bytes_done > 0) {
if (ret < 0)
- ret = req->io->rw.bytes_done;
+ ret = io->bytes_done;
else
- ret += req->io->rw.bytes_done;
+ ret += io->bytes_done;
}
if (req->flags & REQ_F_CUR_POS)
@@ -2587,18 +2839,12 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
struct io_ring_ctx *ctx = req->ctx;
size_t len = req->rw.len;
struct io_mapped_ubuf *imu;
- u16 index, buf_index;
+ u16 index, buf_index = req->buf_index;
size_t offset;
u64 buf_addr;
- /* attempt to use fixed buffers without having provided iovecs */
- if (unlikely(!ctx->user_bufs))
- return -EFAULT;
-
- buf_index = req->buf_index;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
-
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = &ctx->user_bufs[index];
buf_addr = req->rw.addr;
@@ -2837,28 +3083,25 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
return ret;
}
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
- iovec, iter);
-#endif
-
- return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+ return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ req->ctx->compat);
}
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
- if (!req->io)
+ struct io_async_rw *iorw = req->async_data;
+
+ if (!iorw)
return __io_import_iovec(rw, req, iovec, iter, needs_lock);
*iovec = NULL;
- return iov_iter_count(&req->io->rw.iter);
+ return iov_iter_count(&iorw->iter);
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
- return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos;
+ return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}
/*
@@ -2922,10 +3165,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
const struct iovec *fast_iov, struct iov_iter *iter)
{
- struct io_async_rw *rw = &req->io->rw;
+ struct io_async_rw *rw = req->async_data;
memcpy(&rw->iter, iter, sizeof(*iter));
- rw->free_iovec = NULL;
+ rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
if (iter->type == ITER_BVEC)
@@ -2942,33 +3185,33 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
- rw->free_iovec = iovec;
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int __io_alloc_async_ctx(struct io_kiocb *req)
+static inline int __io_alloc_async_data(struct io_kiocb *req)
{
- req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
- return req->io == NULL;
+ WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
+ req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
+ return req->async_data == NULL;
}
-static int io_alloc_async_ctx(struct io_kiocb *req)
+static int io_alloc_async_data(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].async_ctx)
+ if (!io_op_defs[req->opcode].needs_async_data)
return 0;
- return __io_alloc_async_ctx(req);
+ return __io_alloc_async_data(req);
}
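
Rather than one union-sized io_async_ctx for every request, each opcode now declares its async_size in io_op_defs[], and __io_alloc_async_data() allocates exactly that much. A toy table-driven version in plain C (struct names, table contents and indices are invented for illustration):

#include <stdlib.h>

struct async_rw   { char iov[64]; long bytes_done; };
struct async_conn { char addr[128]; };

static const size_t async_size[] = {
	[0] = sizeof(struct async_rw),		/* e.g. a read op */
	[1] = sizeof(struct async_conn),	/* e.g. a connect op */
	[2] = 0,				/* op with no async state */
};

static void *alloc_async_data(int opcode)
{
	if (!async_size[opcode])
		return NULL;			/* nothing to persist */
	return malloc(async_size[opcode]);	/* exact-size allocation */
}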
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
const struct iovec *fast_iov,
struct iov_iter *iter, bool force)
{
- if (!force && !io_op_defs[req->opcode].async_ctx)
+ if (!force && !io_op_defs[req->opcode].needs_async_data)
return 0;
- if (!req->io) {
- if (__io_alloc_async_ctx(req))
+ if (!req->async_data) {
+ if (__io_alloc_async_data(req))
return -ENOMEM;
io_req_map_rw(req, iovec, fast_iov, iter);
@@ -2976,29 +3219,28 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
return 0;
}
-static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
- bool force_nonblock)
+static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
- struct io_async_rw *iorw = &req->io->rw;
- struct iovec *iov;
+ struct io_async_rw *iorw = req->async_data;
+ struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- iorw->iter.iov = iov = iorw->fast_iov;
- ret = __io_import_iovec(rw, req, &iov, &iorw->iter, !force_nonblock);
+ ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
- iorw->iter.iov = iov;
- io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter);
+ iorw->bytes_done = 0;
+ iorw->free_iovec = iov;
+ if (iov)
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
-static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
ssize_t ret;
- ret = io_prep_rw(req, sqe, force_nonblock);
+ ret = io_prep_rw(req, sqe);
if (ret)
return ret;
@@ -3006,9 +3248,9 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return -EBADF;
/* either don't need iovec imported or already have it */
- if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
+ if (!req->async_data)
return 0;
- return io_rw_prep_async(req, READ, force_nonblock);
+ return io_rw_prep_async(req, READ);
}
/*
@@ -3034,6 +3276,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
if (!wake_page_match(wpq, key))
return 0;
+ req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
list_del_init(&wait->entry);
init_task_work(&req->task_work, io_req_task_submit);
@@ -3041,7 +3284,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req, &req->task_work, true);
+ ret = io_req_task_work_add(req, true);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -3068,7 +3311,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
*/
static bool io_rw_should_retry(struct io_kiocb *req)
{
- struct wait_page_queue *wait = &req->io->rw.wpq;
+ struct io_async_rw *rw = req->async_data;
+ struct wait_page_queue *wait = &rw->wpq;
struct kiocb *kiocb = &req->rw.kiocb;
/* never retry for NOWAIT, we just complete with -EAGAIN */
@@ -3091,9 +3335,8 @@ static bool io_rw_should_retry(struct io_kiocb *req)
wait->wait.flags = 0;
INIT_LIST_HEAD(&wait->wait.entry);
kiocb->ki_flags |= IOCB_WAITQ;
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
kiocb->ki_waitq = wait;
-
- io_get_req_task(req);
return true;
}
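
io_rw_should_retry() arms the retry by handing the page cache the wait entry embedded in the request's async_data: IOCB_WAITQ makes a locked page queue io_async_buf_func() instead of sleeping, and IOCB_NOWAIT is dropped so the armed attempt is not short-circuited with -EAGAIN. The arming sequence, condensed from the hunk above (kernel context assumed):

	struct io_async_rw *rw = req->async_data;
	struct kiocb *kiocb = &req->rw.kiocb;

	kiocb->ki_flags |= IOCB_WAITQ;	 /* queue callback on locked page */
	kiocb->ki_flags &= ~IOCB_NOWAIT; /* the armed retry may block */
	kiocb->ki_waitq = &rw->wpq;	 /* wait entry lives in async_data */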
@@ -3113,11 +3356,13 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
+ struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
size_t iov_count;
+ bool no_async;
- if (req->io)
- iter = &req->io->rw.iter;
+ if (rw)
+ iter = &rw->iter;
ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
if (ret < 0)
@@ -3130,9 +3375,13 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
kiocb->ki_flags &= ~IOCB_NOWAIT;
+ else
+ kiocb->ki_flags |= IOCB_NOWAIT;
+
/* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_async(req->file, READ))
+ no_async = force_nonblock && !io_file_supports_async(req->file, READ);
+ if (no_async)
goto copy_iov;
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
@@ -3155,10 +3404,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
goto done;
/* some cases will consume bytes even on error returns */
iov_iter_revert(iter, iov_count - iov_iter_count(iter));
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
- if (ret)
- goto out_free;
- return -EAGAIN;
+ ret = 0;
+ goto copy_iov;
} else if (ret < 0) {
/* make sure -ERESTARTSYS -> -EINTR is done */
goto done;
@@ -3176,12 +3423,15 @@ copy_iov:
ret = ret2;
goto out_free;
}
+ if (no_async)
+ return -EAGAIN;
+ rw = req->async_data;
/* it's copied and will be cleaned with ->io */
iovec = NULL;
/* now use our persistent iterator, if we aren't already */
- iter = &req->io->rw.iter;
+ iter = &rw->iter;
retry:
- req->io->rw.bytes_done += ret;
+ rw->bytes_done += ret;
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ;
@@ -3212,12 +3462,11 @@ out_free:
return ret;
}
-static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
ssize_t ret;
- ret = io_prep_rw(req, sqe, force_nonblock);
+ ret = io_prep_rw(req, sqe);
if (ret)
return ret;
@@ -3225,9 +3474,9 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return -EBADF;
/* either don't need iovec imported or already have it */
- if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
+ if (!req->async_data)
return 0;
- return io_rw_prep_async(req, WRITE, force_nonblock);
+ return io_rw_prep_async(req, WRITE);
}
static int io_write(struct io_kiocb *req, bool force_nonblock,
@@ -3236,11 +3485,12 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
+ struct io_async_rw *rw = req->async_data;
size_t iov_count;
ssize_t ret, ret2, io_size;
- if (req->io)
- iter = &req->io->rw.iter;
+ if (rw)
+ iter = &rw->iter;
ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
if (ret < 0)
@@ -3251,7 +3501,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
- req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ else
+ kiocb->ki_flags |= IOCB_NOWAIT;
/* If the file doesn't support async, just async punt */
if (force_nonblock && !io_file_supports_async(req->file, WRITE))
@@ -3323,10 +3575,7 @@ static int __io_splice_prep(struct io_kiocb *req,
{
struct io_splice* sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
- int ret;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -3337,10 +3586,10 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
- ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
- (sp->flags & SPLICE_F_FD_IN_FIXED));
- if (ret)
- return ret;
+ sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
+ (sp->flags & SPLICE_F_FD_IN_FIXED));
+ if (!sp->file_in)
+ return -EBADF;
req->flags |= REQ_F_NEED_CLEANUP;
if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
@@ -3508,8 +3757,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
- return -EINVAL;
if (unlikely(sqe->ioprio || sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
@@ -3536,8 +3783,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
u64 flags, mode;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
mode = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->open_flags);
req->open.how = build_open_how(flags, mode);
@@ -3550,8 +3797,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
if (len < OPEN_HOW_SIZE_VER0)
@@ -3767,7 +4014,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -3834,7 +4081,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
if (force_nonblock)
return -EAGAIN;
- ret = do_madvise(ma->addr, ma->len, ma->advice);
+ ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
io_req_complete(req, ret);
@@ -3882,7 +4129,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
@@ -3938,8 +4185,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
- if ((req->file && req->file->f_op == &io_uring_fops) ||
- req->close.fd == req->ctx->ring_fd)
+ if ((req->file && req->file->f_op == &io_uring_fops))
return -EBADF;
req->close.put_file = NULL;
@@ -3969,7 +4215,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock,
}
/* No ->flush() or already async, safely close from here */
- ret = filp_close(close->put_file, req->work.files);
+ ret = filp_close(close->put_file, req->work.identity->files);
if (ret < 0)
req_set_fail_links(req);
fput(close->put_file);
@@ -4016,15 +4262,18 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
- if (req->io)
+ struct io_async_msghdr *async_msg = req->async_data;
+
+ if (async_msg)
return -EAGAIN;
- if (io_alloc_async_ctx(req)) {
+ if (io_alloc_async_data(req)) {
if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
return -ENOMEM;
}
+ async_msg = req->async_data;
req->flags |= REQ_F_NEED_CLEANUP;
- memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
+ memcpy(async_msg, kmsg, sizeof(*kmsg));
return -EAGAIN;
}
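
io_setup_async_msg() is one instance of a pattern this patch uses repeatedly (io_connect() below is another): attempt the operation with stack-held state, and only on -EAGAIN copy that state into freshly allocated async_data so the punted retry does not re-parse anything. A generic userspace sketch of copy-to-heap-before-retry (all names and the stub are invented):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct op_ctx { int parsed; /* expensive-to-rebuild state */ };

static int try_op(struct op_ctx *ctx)
{
	(void)ctx;
	return -EAGAIN;		/* stub: pretend the fast path is busy */
}

static int submit(struct op_ctx **saved)
{
	struct op_ctx stack_ctx = { .parsed = 1 };
	int ret = try_op(&stack_ctx);

	if (ret == -EAGAIN && !*saved) {
		*saved = malloc(sizeof(**saved));
		if (!*saved)
			return -ENOMEM;
		/* persist the parsed state for the async retry */
		memcpy(*saved, &stack_ctx, sizeof(stack_ctx));
	}
	return ret;
}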
@@ -4039,8 +4288,8 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ struct io_async_msghdr *async_msg = req->async_data;
struct io_sr_msg *sr = &req->sr_msg;
- struct io_async_ctx *io = req->io;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -4055,13 +4304,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- if (!io || req->opcode == IORING_OP_SEND)
+ if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
return 0;
- /* iovec is already imported */
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
-
- ret = io_sendmsg_copy_hdr(req, &io->msg);
+ ret = io_sendmsg_copy_hdr(req, async_msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
@@ -4079,9 +4324,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
if (unlikely(!sock))
return ret;
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
+ if (req->async_data) {
+ kmsg = req->async_data;
+ kmsg->msg.msg_name = &kmsg->addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
@@ -4130,7 +4375,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
- return ret;;
+ return ret;
msg.msg_name = NULL;
msg.msg_control = NULL;
@@ -4179,8 +4424,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
sr->len);
iomsg->iov = NULL;
} else {
- ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &iomsg->iov, &iomsg->msg.msg_iter);
+ ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
+ &iomsg->iov, &iomsg->msg.msg_iter,
+ false);
if (ret > 0)
ret = 0;
}
@@ -4220,9 +4466,9 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
sr->len = iomsg->iov[0].iov_len;
iomsg->iov = NULL;
} else {
- ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
- &iomsg->iov,
- &iomsg->msg.msg_iter);
+ ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
+ UIO_FASTIOV, &iomsg->iov,
+ &iomsg->msg.msg_iter, true);
if (ret < 0)
return ret;
}
@@ -4268,8 +4514,8 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_async_msghdr *async_msg = req->async_data;
struct io_sr_msg *sr = &req->sr_msg;
- struct io_async_ctx *io = req->io;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -4285,13 +4531,9 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- if (!io || req->opcode == IORING_OP_RECV)
- return 0;
- /* iovec is already imported */
- if (req->flags & REQ_F_NEED_CLEANUP)
+ if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
return 0;
-
- ret = io_recvmsg_copy_hdr(req, &io->msg);
+ ret = io_recvmsg_copy_hdr(req, async_msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
@@ -4310,9 +4552,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
if (unlikely(!sock))
return ret;
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
+ if (req->async_data) {
+ kmsg = req->async_data;
+ kmsg->msg.msg_name = &kmsg->addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
@@ -4454,7 +4696,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock,
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
- struct io_async_ctx *io = req->io;
+ struct io_async_connect *io = req->async_data;
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
@@ -4468,22 +4710,22 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
return move_addr_to_kernel(conn->addr, conn->addr_len,
- &io->connect.address);
+ &io->address);
}
static int io_connect(struct io_kiocb *req, bool force_nonblock,
struct io_comp_state *cs)
{
- struct io_async_ctx __io, *io;
+ struct io_async_connect __io, *io;
unsigned file_flags;
int ret;
- if (req->io) {
- io = req->io;
+ if (req->async_data) {
+ io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
req->connect.addr_len,
- &__io.connect.address);
+ &__io.address);
if (ret)
goto out;
io = &__io;
@@ -4491,16 +4733,17 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock,
file_flags = force_nonblock ? O_NONBLOCK : 0;
- ret = __sys_connect_file(req->file, &io->connect.address,
+ ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->io)
+ if (req->async_data)
return -EAGAIN;
- if (io_alloc_async_ctx(req)) {
+ if (io_alloc_async_data(req)) {
ret = -ENOMEM;
goto out;
}
- memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
+ io = req->async_data;
+ memcpy(req->async_data, &__io, sizeof(__io));
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
@@ -4608,7 +4851,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok);
+ ret = io_req_task_work_add(req, twa_signal_ok);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -4642,9 +4885,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
{
- /* pure poll stashes this in ->io, poll driven retry elsewhere */
+ /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
if (req->opcode == IORING_OP_POLL_ADD)
- return (struct io_poll_iocb *) req->io;
+ return req->async_data;
return req->apoll->double_poll;
}
@@ -4694,10 +4937,9 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
- req->flags |= REQ_F_COMP_LOCKED;
- *nxt = io_put_req_find_next(req);
spin_unlock_irq(&ctx->completion_lock);
+ *nxt = io_put_req_find_next(req);
io_cqring_ev_posted(ctx);
}
@@ -4724,6 +4966,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
if (mask && !(mask & poll->events))
return 0;
+ list_del_init(&wait->entry);
+
if (poll && poll->head) {
bool done;
@@ -4764,6 +5008,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
* for write). Setup a separate io_poll_iocb if this happens.
*/
if (unlikely(poll->head)) {
+ struct io_poll_iocb *poll_one = poll;
+
/* already have a 2nd entry, fail a third attempt */
if (*poll_ptr) {
pt->error = -EINVAL;
@@ -4774,7 +5020,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -ENOMEM;
return;
}
- io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
+ io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
refcount_inc(&req->refs);
poll->wait.private = req;
*poll_ptr = poll;
@@ -4919,7 +5165,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
apoll->double_poll = NULL;
req->flags |= REQ_F_POLLED;
- io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4928,6 +5173,12 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
mask |= POLLIN | POLLRDNORM;
if (def->pollout)
mask |= POLLOUT | POLLWRNORM;
+
+ /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
+ if ((req->opcode == IORING_OP_RECVMSG) &&
+ (req->sr_msg.msg_flags & MSG_ERRQUEUE))
+ mask &= ~POLLIN;
+
mask |= POLLERR | POLLPRI;
ipt.pt._qproc = io_async_queue_proc;
@@ -4986,15 +5237,17 @@ static bool io_poll_remove_one(struct io_kiocb *req)
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
- req->flags |= REQ_F_COMP_LOCKED;
req_set_fail_links(req);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
}
return do_complete;
}
-static void io_poll_remove_all(struct io_ring_ctx *ctx)
+/*
+ * Returns true if we found and killed one or more poll requests
+ */
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5005,13 +5258,17 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
struct hlist_head *list;
list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(req, tmp, list, hash_node)
- posted += io_poll_remove_one(req);
+ hlist_for_each_entry_safe(req, tmp, list, hash_node) {
+ if (io_task_match(req, tsk))
+ posted += io_poll_remove_one(req);
+ }
}
spin_unlock_irq(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
+
+ return posted != 0;
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -5079,7 +5336,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io);
+ __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -5100,8 +5357,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
#endif
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
(events & EPOLLEXCLUSIVE);
-
- io_get_req_task(req);
return 0;
}
@@ -5140,16 +5395,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
+ list_del_init(&req->timeout.list);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
- /*
- * We could be racing with timeout deletion. If the list is empty,
- * then timeout lookup already found it and will be handling it.
- */
- if (!list_empty(&req->timeout.list))
- list_del_init(&req->timeout.list);
-
io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -5162,18 +5411,17 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
static int __io_timeout_cancel(struct io_kiocb *req)
{
+ struct io_timeout_data *io = req->async_data;
int ret;
- list_del_init(&req->timeout.list);
-
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
+ ret = hrtimer_try_to_cancel(&io->timer);
if (ret == -1)
return -EALREADY;
+ list_del_init(&req->timeout.list);
req_set_fail_links(req);
- req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, -ECANCELED);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
return 0;
}
@@ -5202,14 +5450,10 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len)
- return -EINVAL;
-
- req->timeout.addr = READ_ONCE(sqe->addr);
- req->timeout.flags = READ_ONCE(sqe->timeout_flags);
- if (req->timeout.flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
return -EINVAL;
+ req->timeout_rem.addr = READ_ONCE(sqe->addr);
return 0;
}
@@ -5222,7 +5466,7 @@ static int io_timeout_remove(struct io_kiocb *req)
int ret;
spin_lock_irq(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, req->timeout.addr);
+ ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
@@ -5253,10 +5497,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->timeout.off = off;
- if (!req->io && io_alloc_async_ctx(req))
+ if (!req->async_data && io_alloc_async_data(req))
return -ENOMEM;
- data = &req->io->timeout;
+ data = req->async_data;
data->req = req;
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
@@ -5274,7 +5518,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_timeout_data *data = &req->io->timeout;
+ struct io_timeout_data *data = req->async_data;
struct list_head *entry;
u32 tail, off = req->timeout.off;
@@ -5399,6 +5643,8 @@ static int io_async_cancel(struct io_kiocb *req)
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
+ return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->rw_flags)
@@ -5435,118 +5681,86 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
return 0;
}
-static int io_req_defer_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- ssize_t ret = 0;
-
- if (!sqe)
- return 0;
-
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
- ret = io_prep_work_files(req);
- if (unlikely(ret))
- return ret;
-
switch (req->opcode) {
case IORING_OP_NOP:
- break;
+ return 0;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- ret = io_read_prep(req, sqe, true);
- break;
+ return io_read_prep(req, sqe);
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- ret = io_write_prep(req, sqe, true);
- break;
+ return io_write_prep(req, sqe);
case IORING_OP_POLL_ADD:
- ret = io_poll_add_prep(req, sqe);
- break;
+ return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove_prep(req, sqe);
- break;
+ return io_poll_remove_prep(req, sqe);
case IORING_OP_FSYNC:
- ret = io_prep_fsync(req, sqe);
- break;
+ return io_prep_fsync(req, sqe);
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_prep_sfr(req, sqe);
- break;
+ return io_prep_sfr(req, sqe);
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
- ret = io_sendmsg_prep(req, sqe);
- break;
+ return io_sendmsg_prep(req, sqe);
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
- ret = io_recvmsg_prep(req, sqe);
- break;
+ return io_recvmsg_prep(req, sqe);
case IORING_OP_CONNECT:
- ret = io_connect_prep(req, sqe);
- break;
+ return io_connect_prep(req, sqe);
case IORING_OP_TIMEOUT:
- ret = io_timeout_prep(req, sqe, false);
- break;
+ return io_timeout_prep(req, sqe, false);
case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove_prep(req, sqe);
- break;
+ return io_timeout_remove_prep(req, sqe);
case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel_prep(req, sqe);
- break;
+ return io_async_cancel_prep(req, sqe);
case IORING_OP_LINK_TIMEOUT:
- ret = io_timeout_prep(req, sqe, true);
- break;
+ return io_timeout_prep(req, sqe, true);
case IORING_OP_ACCEPT:
- ret = io_accept_prep(req, sqe);
- break;
+ return io_accept_prep(req, sqe);
case IORING_OP_FALLOCATE:
- ret = io_fallocate_prep(req, sqe);
- break;
+ return io_fallocate_prep(req, sqe);
case IORING_OP_OPENAT:
- ret = io_openat_prep(req, sqe);
- break;
+ return io_openat_prep(req, sqe);
case IORING_OP_CLOSE:
- ret = io_close_prep(req, sqe);
- break;
+ return io_close_prep(req, sqe);
case IORING_OP_FILES_UPDATE:
- ret = io_files_update_prep(req, sqe);
- break;
+ return io_files_update_prep(req, sqe);
case IORING_OP_STATX:
- ret = io_statx_prep(req, sqe);
- break;
+ return io_statx_prep(req, sqe);
case IORING_OP_FADVISE:
- ret = io_fadvise_prep(req, sqe);
- break;
+ return io_fadvise_prep(req, sqe);
case IORING_OP_MADVISE:
- ret = io_madvise_prep(req, sqe);
- break;
+ return io_madvise_prep(req, sqe);
case IORING_OP_OPENAT2:
- ret = io_openat2_prep(req, sqe);
- break;
+ return io_openat2_prep(req, sqe);
case IORING_OP_EPOLL_CTL:
- ret = io_epoll_ctl_prep(req, sqe);
- break;
+ return io_epoll_ctl_prep(req, sqe);
case IORING_OP_SPLICE:
- ret = io_splice_prep(req, sqe);
- break;
+ return io_splice_prep(req, sqe);
case IORING_OP_PROVIDE_BUFFERS:
- ret = io_provide_buffers_prep(req, sqe);
- break;
+ return io_provide_buffers_prep(req, sqe);
case IORING_OP_REMOVE_BUFFERS:
- ret = io_remove_buffers_prep(req, sqe);
- break;
+ return io_remove_buffers_prep(req, sqe);
case IORING_OP_TEE:
- ret = io_tee_prep(req, sqe);
- break;
- default:
- printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
- req->opcode);
- ret = -EINVAL;
- break;
+ return io_tee_prep(req, sqe);
}
- return ret;
+ printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
+ req->opcode);
+ return -EINVAL;
+}
+
+static int io_req_defer_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (!sqe)
+ return 0;
+ if (io_alloc_async_data(req))
+ return -EAGAIN;
+ return io_req_prep(req, sqe);
}
static u32 io_get_sequence(struct io_kiocb *req)
@@ -5580,7 +5794,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
return 0;
- if (!req->io) {
+ if (!req->async_data) {
ret = io_req_defer_prep(req, sqe);
if (ret)
return ret;
@@ -5606,10 +5820,24 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EIOCBQUEUED;
}
-static void __io_clean_op(struct io_kiocb *req)
+static void io_req_drop_files(struct io_kiocb *req)
{
- struct io_async_ctx *io = req->io;
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ if (waitqueue_active(&ctx->inflight_wait))
+ wake_up(&ctx->inflight_wait);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
+ put_files_struct(req->work.identity->files);
+ put_nsproxy(req->work.identity->nsproxy);
+ req->work.flags &= ~IO_WQ_WORK_FILES;
+}
+
+static void __io_clean_op(struct io_kiocb *req)
+{
if (req->flags & REQ_F_BUFFER_SELECTED) {
switch (req->opcode) {
case IORING_OP_READV:
@@ -5632,39 +5860,39 @@ static void __io_clean_op(struct io_kiocb *req)
case IORING_OP_READ:
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- if (io->rw.free_iovec)
- kfree(io->rw.free_iovec);
+ case IORING_OP_WRITE: {
+ struct io_async_rw *io = req->async_data;
+ if (io->free_iovec)
+ kfree(io->free_iovec);
break;
+ }
case IORING_OP_RECVMSG:
- case IORING_OP_SENDMSG:
- if (io->msg.iov != io->msg.fast_iov)
- kfree(io->msg.iov);
+ case IORING_OP_SENDMSG: {
+ struct io_async_msghdr *io = req->async_data;
+ if (io->iov != io->fast_iov)
+ kfree(io->iov);
break;
+ }
case IORING_OP_SPLICE:
case IORING_OP_TEE:
io_put_file(req, req->splice.file_in,
(req->splice.flags & SPLICE_F_FD_IN_FIXED));
break;
+ case IORING_OP_OPENAT:
+ case IORING_OP_OPENAT2:
+ if (req->open.filename)
+ putname(req->open.filename);
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
- req->flags &= ~REQ_F_INFLIGHT;
- }
+ if (req->flags & REQ_F_INFLIGHT)
+ io_req_drop_files(req);
}
-static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock, struct io_comp_state *cs)
+static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -5676,221 +5904,89 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- if (sqe) {
- ret = io_read_prep(req, sqe, force_nonblock);
- if (ret < 0)
- break;
- }
ret = io_read(req, force_nonblock, cs);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- if (sqe) {
- ret = io_write_prep(req, sqe, force_nonblock);
- if (ret < 0)
- break;
- }
ret = io_write(req, force_nonblock, cs);
break;
case IORING_OP_FSYNC:
- if (sqe) {
- ret = io_prep_fsync(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_fsync(req, force_nonblock);
break;
case IORING_OP_POLL_ADD:
- if (sqe) {
- ret = io_poll_add_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_poll_add(req);
break;
case IORING_OP_POLL_REMOVE:
- if (sqe) {
- ret = io_poll_remove_prep(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_poll_remove(req);
break;
case IORING_OP_SYNC_FILE_RANGE:
- if (sqe) {
- ret = io_prep_sfr(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_sync_file_range(req, force_nonblock);
break;
case IORING_OP_SENDMSG:
+ ret = io_sendmsg(req, force_nonblock, cs);
+ break;
case IORING_OP_SEND:
- if (sqe) {
- ret = io_sendmsg_prep(req, sqe);
- if (ret < 0)
- break;
- }
- if (req->opcode == IORING_OP_SENDMSG)
- ret = io_sendmsg(req, force_nonblock, cs);
- else
- ret = io_send(req, force_nonblock, cs);
+ ret = io_send(req, force_nonblock, cs);
break;
case IORING_OP_RECVMSG:
+ ret = io_recvmsg(req, force_nonblock, cs);
+ break;
case IORING_OP_RECV:
- if (sqe) {
- ret = io_recvmsg_prep(req, sqe);
- if (ret)
- break;
- }
- if (req->opcode == IORING_OP_RECVMSG)
- ret = io_recvmsg(req, force_nonblock, cs);
- else
- ret = io_recv(req, force_nonblock, cs);
+ ret = io_recv(req, force_nonblock, cs);
break;
case IORING_OP_TIMEOUT:
- if (sqe) {
- ret = io_timeout_prep(req, sqe, false);
- if (ret)
- break;
- }
ret = io_timeout(req);
break;
case IORING_OP_TIMEOUT_REMOVE:
- if (sqe) {
- ret = io_timeout_remove_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_timeout_remove(req);
break;
case IORING_OP_ACCEPT:
- if (sqe) {
- ret = io_accept_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_accept(req, force_nonblock, cs);
break;
case IORING_OP_CONNECT:
- if (sqe) {
- ret = io_connect_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_connect(req, force_nonblock, cs);
break;
case IORING_OP_ASYNC_CANCEL:
- if (sqe) {
- ret = io_async_cancel_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_async_cancel(req);
break;
case IORING_OP_FALLOCATE:
- if (sqe) {
- ret = io_fallocate_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_fallocate(req, force_nonblock);
break;
case IORING_OP_OPENAT:
- if (sqe) {
- ret = io_openat_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_openat(req, force_nonblock);
break;
case IORING_OP_CLOSE:
- if (sqe) {
- ret = io_close_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_close(req, force_nonblock, cs);
break;
case IORING_OP_FILES_UPDATE:
- if (sqe) {
- ret = io_files_update_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_files_update(req, force_nonblock, cs);
break;
case IORING_OP_STATX:
- if (sqe) {
- ret = io_statx_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_statx(req, force_nonblock);
break;
case IORING_OP_FADVISE:
- if (sqe) {
- ret = io_fadvise_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_fadvise(req, force_nonblock);
break;
case IORING_OP_MADVISE:
- if (sqe) {
- ret = io_madvise_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_madvise(req, force_nonblock);
break;
case IORING_OP_OPENAT2:
- if (sqe) {
- ret = io_openat2_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_openat2(req, force_nonblock);
break;
case IORING_OP_EPOLL_CTL:
- if (sqe) {
- ret = io_epoll_ctl_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_epoll_ctl(req, force_nonblock, cs);
break;
case IORING_OP_SPLICE:
- if (sqe) {
- ret = io_splice_prep(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_splice(req, force_nonblock);
break;
case IORING_OP_PROVIDE_BUFFERS:
- if (sqe) {
- ret = io_provide_buffers_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_provide_buffers(req, force_nonblock, cs);
break;
case IORING_OP_REMOVE_BUFFERS:
- if (sqe) {
- ret = io_remove_buffers_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_remove_buffers(req, force_nonblock, cs);
break;
case IORING_OP_TEE:
- if (sqe) {
- ret = io_tee_prep(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_tee(req, force_nonblock);
break;
default:
@@ -5936,7 +6032,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
if (!ret) {
do {
- ret = io_issue_sqe(req, NULL, false, NULL);
+ ret = io_issue_sqe(req, false, NULL);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -5965,20 +6061,19 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
return table->files[index & IORING_FILE_TABLE_MASK];
}
-static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
- int fd, struct file **out_file, bool fixed)
+static struct file *io_file_get(struct io_submit_state *state,
+ struct io_kiocb *req, int fd, bool fixed)
{
struct io_ring_ctx *ctx = req->ctx;
struct file *file;
if (fixed) {
- if (unlikely(!ctx->file_data ||
- (unsigned) fd >= ctx->nr_user_files))
- return -EBADF;
+ if (unlikely((unsigned int)fd >= ctx->nr_user_files))
+ return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
if (file) {
- req->fixed_file_refs = ctx->file_data->cur_refs;
+ req->fixed_file_refs = &ctx->file_data->node->refs;
percpu_ref_get(req->fixed_file_refs);
}
} else {
@@ -5986,11 +6081,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
file = __io_file_get(state, fd);
}
- if (file || io_op_defs[req->opcode].needs_file_no_error) {
- *out_file = file;
- return 0;
- }
- return -EBADF;
+ return file;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
@@ -6002,46 +6093,10 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
if (unlikely(!fixed && io_async_submit(req->ctx)))
return -EBADF;
- return io_file_get(state, req, fd, &req->file, fixed);
-}
-
-static int io_grab_files(struct io_kiocb *req)
-{
- int ret = -EBADF;
- struct io_ring_ctx *ctx = req->ctx;
-
- io_req_init_async(req);
-
- if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
+ req->file = io_file_get(state, req, fd, fixed);
+ if (req->file || io_op_defs[req->opcode].needs_file_no_error)
return 0;
- if (!ctx->ring_file)
- return -EBADF;
-
- rcu_read_lock();
- spin_lock_irq(&ctx->inflight_lock);
- /*
- * We use the f_ops->flush() handler to ensure that we can flush
- * out work accessing these files if the fd is closed. Check if
- * the fd has changed since we started down this path, and disallow
- * this operation if it has.
- */
- if (fcheck(ctx->ring_fd) == ctx->ring_file) {
- list_add(&req->inflight_entry, &ctx->inflight_list);
- req->flags |= REQ_F_INFLIGHT;
- req->work.files = current->files;
- ret = 0;
- }
- spin_unlock_irq(&ctx->inflight_lock);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline int io_prep_work_files(struct io_kiocb *req)
-{
- if (!io_op_defs[req->opcode].file_table)
- return 0;
- return io_grab_files(req);
+ return -EBADF;
}
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
@@ -6088,7 +6143,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req)
* we got a chance to setup the timer
*/
if (!list_empty(&req->link_list)) {
- struct io_timeout_data *data = &req->io->timeout;
+ struct io_timeout_data *data = req->async_data;
data->timer.function = io_link_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
@@ -6126,8 +6181,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return nxt;
}
-static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_comp_state *cs)
+static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt;
@@ -6137,17 +6191,18 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
again:
linked_timeout = io_prep_linked_timeout(req);
- if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
- req->work.creds != current_cred()) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.identity->creds &&
+ req->work.identity->creds != current_cred()) {
if (old_creds)
revert_creds(old_creds);
- if (old_creds == req->work.creds)
+ if (old_creds == req->work.identity->creds)
old_creds = NULL; /* restored original creds */
else
- old_creds = override_creds(req->work.creds);
+ old_creds = override_creds(req->work.identity->creds);
+ req->work.flags |= IO_WQ_WORK_CREDS;
}
- ret = io_issue_sqe(req, sqe, true, cs);
+ ret = io_issue_sqe(req, true, cs);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -6156,9 +6211,6 @@ again:
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
if (!io_arm_poll_handler(req)) {
punt:
- ret = io_prep_work_files(req);
- if (unlikely(ret))
- goto err;
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
@@ -6172,7 +6224,6 @@ punt:
}
if (unlikely(ret)) {
-err:
/* un-prep timeout, so it'll be killed as any other linked */
req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
@@ -6212,7 +6263,7 @@ fail_req:
io_req_complete(req, ret);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
- if (!req->io) {
+ if (!req->async_data) {
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret))
goto fail_req;
@@ -6226,7 +6277,12 @@ fail_req:
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
- __io_queue_sqe(req, sqe, cs);
+ if (sqe) {
+ ret = io_req_prep(req, sqe);
+ if (unlikely(ret))
+ goto fail_req;
+ }
+ __io_queue_sqe(req, cs);
}
}
@@ -6274,7 +6330,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
}
trace_io_uring_link(ctx, req, head);
- io_get_req_task(req);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
@@ -6323,9 +6378,6 @@ static void io_submit_state_start(struct io_submit_state *state,
struct io_ring_ctx *ctx, unsigned int max_ios)
{
blk_start_plug(&state->plug);
-#ifdef CONFIG_BLOCK
- state->plug.nowait = true;
-#endif
state->comp.nr = 0;
INIT_LIST_HEAD(&state->comp.list);
state->comp.ctx = ctx;
@@ -6382,6 +6434,32 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
ctx->cached_sq_head++;
}
+/*
+ * Check SQE restrictions (opcode and flags).
+ *
+ * Returns 'true' if SQE is allowed, 'false' otherwise.
+ */
+static inline bool io_check_restriction(struct io_ring_ctx *ctx,
+ struct io_kiocb *req,
+ unsigned int sqe_flags)
+{
+ if (!ctx->restricted)
+ return true;
+
+ if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
+ return false;
+
+ if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
+ ctx->restrictions.sqe_flags_required)
+ return false;
+
+ if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
+ ctx->restrictions.sqe_flags_required))
+ return false;
+
+ return true;
+}
+
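io_check_restriction() applies three registered constraints to each SQE: the opcode bit must be set in sqe_op, every bit of sqe_flags_required must be present, and nothing outside allowed|required may be set. The same mask algebra as a standalone C program (plain unsigned bitmaps stand in for the kernel's bitmap and flag types):

#include <stdbool.h>
#include <stdio.h>

struct restrictions {
	unsigned op_allowed;	/* bit per opcode */
	unsigned flags_allowed;
	unsigned flags_required;
};

static bool check(const struct restrictions *r, int op, unsigned flags)
{
	if (!(r->op_allowed & (1u << op)))
		return false;	/* opcode not registered as allowed */
	if ((flags & r->flags_required) != r->flags_required)
		return false;	/* a required flag is missing */
	if (flags & ~(r->flags_allowed | r->flags_required))
		return false;	/* a forbidden flag is set */
	return true;
}

int main(void)
{
	struct restrictions r = { .op_allowed = 1u << 3,
				  .flags_allowed = 0x1, .flags_required = 0x4 };

	printf("%d\n", check(&r, 3, 0x4));	/* 1: op ok, required set */
	printf("%d\n", check(&r, 3, 0x2));	/* 0: required 0x4 missing */
	return 0;
}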
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
IOSQE_BUFFER_SELECT)
@@ -6391,11 +6469,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
struct io_submit_state *state)
{
unsigned int sqe_flags;
- int id;
+ int id, ret;
req->opcode = READ_ONCE(sqe->opcode);
req->user_data = READ_ONCE(sqe->user_data);
- req->io = NULL;
+ req->async_data = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
@@ -6415,17 +6493,26 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
return -EINVAL;
+ if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
+ return -EACCES;
+
if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;
id = READ_ONCE(sqe->personality);
if (id) {
+ struct io_identity *iod;
+
io_req_init_async(req);
- req->work.creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->work.creds))
+ iod = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!iod))
return -EINVAL;
- get_cred(req->work.creds);
+ refcount_inc(&iod->count);
+ io_put_identity(current->io_uring, req);
+ get_cred(iod->creds);
+ req->work.identity = iod;
+ req->work.flags |= IO_WQ_WORK_CREDS;
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -6434,11 +6521,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (!io_op_defs[req->opcode].needs_file)
return 0;
- return io_req_set_file(state, req, READ_ONCE(sqe->fd));
+ ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
+ state->ios_left--;
+ return ret;
}
-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd)
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
struct io_submit_state state;
struct io_kiocb *link = NULL;
@@ -6447,7 +6535,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
if (!list_empty(&ctx->cq_overflow_list) &&
- !io_cqring_overflow_flush(ctx, false))
+ !io_cqring_overflow_flush(ctx, false, NULL, NULL))
return -EBUSY;
}
@@ -6457,10 +6545,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- io_submit_state_start(&state, ctx, nr);
+ percpu_counter_add(&current->io_uring->inflight, nr);
+ refcount_add(nr, &current->usage);
- ctx->ring_fd = ring_fd;
- ctx->ring_file = ring_file;
+ io_submit_state_start(&state, ctx, nr);
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
@@ -6478,12 +6566,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
submitted = -EAGAIN;
break;
}
-
- err = io_init_req(ctx, req, sqe, &state);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
+ err = io_init_req(ctx, req, sqe, &state);
if (unlikely(err)) {
fail_req:
io_put_req(req);
@@ -6500,8 +6587,12 @@ fail_req:
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
+ struct io_uring_task *tctx = current->io_uring;
+ int unused = nr - ref_used;
- percpu_ref_put_many(&ctx->refs, nr - ref_used);
+ percpu_ref_put_many(&ctx->refs, unused);
+ percpu_counter_sub(&tctx->inflight, unused);
+ put_task_struct_many(current, unused);
}
if (link)
io_queue_link_head(link, &state.comp);
@@ -6528,117 +6619,190 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
-static int io_sq_thread(void *data)
+static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
+ int sync, void *key)
{
- struct io_ring_ctx *ctx = data;
- const struct cred *old_cred;
- DEFINE_WAIT(wait);
- unsigned long timeout;
+ struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
+ int ret;
+
+ ret = autoremove_wake_function(wqe, mode, sync, key);
+ if (ret) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ }
+ return ret;
+}
+
+enum sq_ret {
+ SQT_IDLE = 1,
+ SQT_SPIN = 2,
+ SQT_DID_WORK = 4,
+};
+
+static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
+ unsigned long start_jiffies, bool cap_entries)
+{
+ unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
+ struct io_sq_data *sqd = ctx->sq_data;
+ unsigned int to_submit;
int ret = 0;
- complete(&ctx->sq_thread_comp);
+again:
+ if (!list_empty(&ctx->iopoll_list)) {
+ unsigned nr_events = 0;
+
+ mutex_lock(&ctx->uring_lock);
+ if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ io_do_iopoll(ctx, &nr_events, 0);
+ mutex_unlock(&ctx->uring_lock);
+ }
+
+ to_submit = io_sqring_entries(ctx);
- old_cred = override_creds(ctx->creds);
+ /*
+ * If submit got -EBUSY, flag us as needing the application
+ * to enter the kernel to reap and flush events.
+ */
+ if (!to_submit || ret == -EBUSY || need_resched()) {
+ /*
+ * Drop cur_mm before scheduling; we can't hold it for
+ * long periods (or over schedule()). Do this before
+ * adding ourselves to the waitqueue, as the unuse/drop
+ * may sleep.
+ */
+ io_sq_thread_drop_mm();
- timeout = jiffies + ctx->sq_thread_idle;
- while (!kthread_should_park()) {
- unsigned int to_submit;
+ /*
+ * We're polling. If we're within the defined idle
+ * period, then let us spin without work before going
+ * to sleep. The exception is if we got EBUSY doing
+ * more IO; in that case we should wait for the
+ * application to reap events and wake us up.
+ */
+ if (!list_empty(&ctx->iopoll_list) || need_resched() ||
+ (!time_after(jiffies, timeout) && ret != -EBUSY &&
+ !percpu_ref_is_dying(&ctx->refs)))
+ return SQT_SPIN;
- if (!list_empty(&ctx->iopoll_list)) {
- unsigned nr_events = 0;
+ prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
+ TASK_INTERRUPTIBLE);
- mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list) && !need_resched())
- io_do_iopoll(ctx, &nr_events, 0);
- else
- timeout = jiffies + ctx->sq_thread_idle;
- mutex_unlock(&ctx->uring_lock);
+ /*
+ * While doing polled IO, before going to sleep, we need
+ * to check if there are new reqs added to iopoll_list;
+ * reqs may have been punted to an io worker and will be
+ * added to iopoll_list later, hence check the
+ * iopoll_list again.
+ */
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->iopoll_list)) {
+ finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ goto again;
}
to_submit = io_sqring_entries(ctx);
+ if (!to_submit || ret == -EBUSY)
+ return SQT_IDLE;
+ }
+
+ finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ io_ring_clear_wakeup_flag(ctx);
+
+ /* if we're handling multiple rings, cap submit size for fairness */
+ if (cap_entries && to_submit > 8)
+ to_submit = 8;
+
+ mutex_lock(&ctx->uring_lock);
+ if (likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit);
+ mutex_unlock(&ctx->uring_lock);
+
+ if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
+ wake_up(&ctx->sqo_sq_wait);
+
+ return SQT_DID_WORK;
+}
+
+static void io_sqd_init_new(struct io_sq_data *sqd)
+{
+ struct io_ring_ctx *ctx;
+
+ while (!list_empty(&sqd->ctx_new_list)) {
+ ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
+ init_wait(&ctx->sqo_wait_entry);
+ ctx->sqo_wait_entry.func = io_sq_wake_function;
+ list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
+ complete(&ctx->sq_thread_comp);
+ }
+}
+
+static int io_sq_thread(void *data)
+{
+ struct cgroup_subsys_state *cur_css = NULL;
+ const struct cred *old_cred = NULL;
+ struct io_sq_data *sqd = data;
+ struct io_ring_ctx *ctx;
+ unsigned long start_jiffies;
+
+ start_jiffies = jiffies;
+ while (!kthread_should_stop()) {
+ enum sq_ret ret = 0;
+ bool cap_entries;
/*
- * If submit got -EBUSY, flag us as needing the application
- * to enter the kernel to reap and flush events.
+ * Any changes to the sqd lists are synchronized through the
+ * kthread parking. This synchronizes the thread vs users;
+ * the users themselves are synchronized on sqd->ctx_lock.
*/
- if (!to_submit || ret == -EBUSY || need_resched()) {
- /*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
- */
- io_sq_thread_drop_mm();
+ if (kthread_should_park())
+ kthread_parkme();
- /*
- * We're polling. If we're within the defined idle
- * period, then let us spin without work before going
- * to sleep. The exception is if we got EBUSY doing
- * more IO, we should wait for the application to
- * reap events and wake us up.
- */
- if (!list_empty(&ctx->iopoll_list) || need_resched() ||
- (!time_after(jiffies, timeout) && ret != -EBUSY &&
- !percpu_ref_is_dying(&ctx->refs))) {
- io_run_task_work();
- cond_resched();
- continue;
- }
+ if (unlikely(!list_empty(&sqd->ctx_new_list)))
+ io_sqd_init_new(sqd);
- prepare_to_wait(&ctx->sqo_wait, &wait,
- TASK_INTERRUPTIBLE);
+ cap_entries = !list_is_singular(&sqd->ctx_list);
- /*
- * While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to iopoll_list,
- * it is because reqs may have been punted to io worker
- * and will be added to iopoll_list later, hence check
- * the iopoll_list again.
- */
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
- finish_wait(&ctx->sqo_wait, &wait);
- continue;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (current->cred != ctx->creds) {
+ if (old_cred)
+ revert_creds(old_cred);
+ old_cred = override_creds(ctx->creds);
}
+ io_sq_thread_associate_blkcg(ctx, &cur_css);
+#ifdef CONFIG_AUDIT
+ current->loginuid = ctx->loginuid;
+ current->sessionid = ctx->sessionid;
+#endif
- io_ring_set_wakeup_flag(ctx);
+ ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
- to_submit = io_sqring_entries(ctx);
- if (!to_submit || ret == -EBUSY) {
- if (kthread_should_park()) {
- finish_wait(&ctx->sqo_wait, &wait);
- break;
- }
- if (io_run_task_work()) {
- finish_wait(&ctx->sqo_wait, &wait);
- io_ring_clear_wakeup_flag(ctx);
- continue;
- }
- if (signal_pending(current))
- flush_signals(current);
- schedule();
- finish_wait(&ctx->sqo_wait, &wait);
+ io_sq_thread_drop_mm();
+ }
- io_ring_clear_wakeup_flag(ctx);
- ret = 0;
+ if (ret & SQT_SPIN) {
+ io_run_task_work();
+ cond_resched();
+ } else if (ret == SQT_IDLE) {
+ if (kthread_should_park())
continue;
- }
- finish_wait(&ctx->sqo_wait, &wait);
-
- io_ring_clear_wakeup_flag(ctx);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_ring_set_wakeup_flag(ctx);
+ schedule();
+ start_jiffies = jiffies;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_ring_clear_wakeup_flag(ctx);
}
-
- mutex_lock(&ctx->uring_lock);
- if (likely(!percpu_ref_is_dying(&ctx->refs)))
- ret = io_submit_sqes(ctx, to_submit, NULL, -1);
- mutex_unlock(&ctx->uring_lock);
- timeout = jiffies + ctx->sq_thread_idle;
}
io_run_task_work();
- io_sq_thread_drop_mm();
- revert_creds(old_cred);
+ if (cur_css)
+ io_sq_thread_unassociate_blkcg();
+ if (old_cred)
+ revert_creds(old_cred);
kthread_parkme();
@@ -6678,6 +6842,22 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
return autoremove_wake_function(curr, mode, wake_flags, key);
}
+static int io_run_task_work_sig(void)
+{
+ if (io_run_task_work())
+ return 1;
+ if (!signal_pending(current))
+ return 0;
+ if (current->jobctl & JOBCTL_TASK_WORK) {
+ spin_lock_irq(&current->sighand->siglock);
+ current->jobctl &= ~JOBCTL_TASK_WORK;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 1;
+ }
+ return -EINTR;
+}
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -6723,19 +6903,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
/* make sure we run task_work before checking for signals */
- if (io_run_task_work())
+ ret = io_run_task_work_sig();
+ if (ret > 0)
continue;
- if (signal_pending(current)) {
- if (current->jobctl & JOBCTL_TASK_WORK) {
- spin_lock_irq(&current->sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- continue;
- }
- ret = -EINTR;
+ else if (ret < 0)
break;
- }
if (io_should_wake(&iowq, false))
break;
schedule();
@@ -6813,18 +6985,116 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
return 0;
}
-static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+static void io_put_sq_data(struct io_sq_data *sqd)
{
- if (ctx->sqo_thread) {
- wait_for_completion(&ctx->sq_thread_comp);
+ if (refcount_dec_and_test(&sqd->refs)) {
/*
* The park is a bit of a work-around; without it we get
* warning spews on shutdown with SQPOLL set and affinity
* set to a single CPU.
*/
- kthread_park(ctx->sqo_thread);
- kthread_stop(ctx->sqo_thread);
- ctx->sqo_thread = NULL;
+ if (sqd->thread) {
+ kthread_park(sqd->thread);
+ kthread_stop(sqd->thread);
+ }
+
+ kfree(sqd);
+ }
+}
+
+static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
+{
+ struct io_ring_ctx *ctx_attach;
+ struct io_sq_data *sqd;
+ struct fd f;
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return ERR_PTR(-ENXIO);
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ ctx_attach = f.file->private_data;
+ sqd = ctx_attach->sq_data;
+ if (!sqd) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ refcount_inc(&sqd->refs);
+ fdput(f);
+ return sqd;
+}
+
+static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
+{
+ struct io_sq_data *sqd;
+
+ if (p->flags & IORING_SETUP_ATTACH_WQ)
+ return io_attach_sq_data(p);
+
+ sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
+ if (!sqd)
+ return ERR_PTR(-ENOMEM);
+
+ refcount_set(&sqd->refs, 1);
+ INIT_LIST_HEAD(&sqd->ctx_list);
+ INIT_LIST_HEAD(&sqd->ctx_new_list);
+ mutex_init(&sqd->ctx_lock);
+ mutex_init(&sqd->lock);
+ init_waitqueue_head(&sqd->wait);
+ return sqd;
+}
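
For context, a hedged userspace sketch (not part of this patch) of how two rings end up sharing one io_sq_data through IORING_SETUP_ATTACH_WQ; sys_io_uring_setup() is a local raw-syscall helper, error handling is elided, and IORING_SETUP_SQPOLL still requires CAP_SYS_ADMIN at this point in the series.

	#include <linux/io_uring.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
	{
		return syscall(__NR_io_uring_setup, entries, p);
	}

	/* Share one SQPOLL thread between two rings (hedged sketch). */
	int make_shared_rings(int *fd_a, int *fd_b)
	{
		struct io_uring_params p;

		memset(&p, 0, sizeof(p));
		p.flags = IORING_SETUP_SQPOLL;		/* first ring creates the thread */
		*fd_a = sys_io_uring_setup(64, &p);

		memset(&p, 0, sizeof(p));
		p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ;
		p.wq_fd = *fd_a;			/* second ring attaches to its sq_data */
		*fd_b = sys_io_uring_setup(64, &p);

		return (*fd_a < 0 || *fd_b < 0) ? -1 : 0;
	}
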
+
+static void io_sq_thread_unpark(struct io_sq_data *sqd)
+ __releases(&sqd->lock)
+{
+ if (!sqd->thread)
+ return;
+ kthread_unpark(sqd->thread);
+ mutex_unlock(&sqd->lock);
+}
+
+static void io_sq_thread_park(struct io_sq_data *sqd)
+ __acquires(&sqd->lock)
+{
+ if (!sqd->thread)
+ return;
+ mutex_lock(&sqd->lock);
+ kthread_park(sqd->thread);
+}
+
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if (sqd) {
+ if (sqd->thread) {
+ /*
+ * We may arrive here from the error branch in
+ * io_sq_offload_create() where the kthread is created
+ * without being woken up, thus wake it up now to make
+ * sure the wait will complete.
+ */
+ wake_up_process(sqd->thread);
+ wait_for_completion(&ctx->sq_thread_comp);
+
+ io_sq_thread_park(sqd);
+ }
+
+ mutex_lock(&sqd->ctx_lock);
+ list_del(&ctx->sqd_list);
+ mutex_unlock(&sqd->ctx_lock);
+
+ if (sqd->thread) {
+ finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ io_sq_thread_unpark(sqd);
+ }
+
+ io_put_sq_data(sqd);
+ ctx->sq_data = NULL;
}
}
@@ -6935,13 +7205,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
- unsigned nr_files)
+static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
+ unsigned nr_tables, unsigned nr_files)
{
int i;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &ctx->file_data->table[i];
+ struct fixed_file_table *table = &file_data->table[i];
unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -6956,7 +7226,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
return 0;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &ctx->file_data->table[i];
+ struct fixed_file_table *table = &file_data->table[i];
kfree(table->files);
}
return 1;
@@ -7118,11 +7388,11 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
__s32 __user *fds = (__s32 __user *) arg;
- unsigned nr_tables;
+ unsigned nr_tables, i;
struct file *file;
- int fd, ret = 0;
- unsigned i;
+ int fd, ret = -ENOMEM;
struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -7131,60 +7401,44 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
- ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
- if (!ctx->file_data)
+ file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
+ if (!file_data)
return -ENOMEM;
- ctx->file_data->ctx = ctx;
- init_completion(&ctx->file_data->done);
- INIT_LIST_HEAD(&ctx->file_data->ref_list);
- spin_lock_init(&ctx->file_data->lock);
+ file_data->ctx = ctx;
+ init_completion(&file_data->done);
+ INIT_LIST_HEAD(&file_data->ref_list);
+ spin_lock_init(&file_data->lock);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
- ctx->file_data->table = kcalloc(nr_tables,
- sizeof(struct fixed_file_table),
- GFP_KERNEL);
- if (!ctx->file_data->table) {
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- return -ENOMEM;
- }
+ file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
+ GFP_KERNEL);
+ if (!file_data->table)
+ goto out_free;
- if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
- kfree(ctx->file_data->table);
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- return -ENOMEM;
- }
+ if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ goto out_free;
- if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
- percpu_ref_exit(&ctx->file_data->refs);
- kfree(ctx->file_data->table);
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- return -ENOMEM;
- }
+ if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
+ goto out_ref;
+ ctx->file_data = file_data;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
struct fixed_file_table *table;
unsigned index;
- ret = -EFAULT;
- if (copy_from_user(&fd, &fds[i], sizeof(fd)))
- break;
+ if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ ret = -EFAULT;
+ goto out_fput;
+ }
/* allow sparse sets */
- if (fd == -1) {
- ret = 0;
+ if (fd == -1)
continue;
- }
- table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
- index = i & IORING_FILE_TABLE_MASK;
file = fget(fd);
-
ret = -EBADF;
if (!file)
- break;
+ goto out_fput;
/*
* Don't allow io_uring instances to be registered. If UNIX
@@ -7195,29 +7449,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (file->f_op == &io_uring_fops) {
fput(file);
- break;
+ goto out_fput;
}
- ret = 0;
+ table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+ index = i & IORING_FILE_TABLE_MASK;
table->files[index] = file;
}
- if (ret) {
- for (i = 0; i < ctx->nr_user_files; i++) {
- file = io_file_from_index(ctx, i);
- if (file)
- fput(file);
- }
- for (i = 0; i < nr_tables; i++)
- kfree(ctx->file_data->table[i].files);
-
- percpu_ref_exit(&ctx->file_data->refs);
- kfree(ctx->file_data->table);
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- ctx->nr_user_files = 0;
- return ret;
- }
-
ret = io_sqe_files_scm(ctx);
if (ret) {
io_sqe_files_unregister(ctx);
@@ -7230,11 +7468,27 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return PTR_ERR(ref_node);
}
- ctx->file_data->cur_refs = &ref_node->refs;
- spin_lock(&ctx->file_data->lock);
- list_add(&ref_node->node, &ctx->file_data->ref_list);
- spin_unlock(&ctx->file_data->lock);
- percpu_ref_get(&ctx->file_data->refs);
+ file_data->node = ref_node;
+ spin_lock(&file_data->lock);
+ list_add(&ref_node->node, &file_data->ref_list);
+ spin_unlock(&file_data->lock);
+ percpu_ref_get(&file_data->refs);
+ return ret;
+out_fput:
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ file = io_file_from_index(ctx, i);
+ if (file)
+ fput(file);
+ }
+ for (i = 0; i < nr_tables; i++)
+ kfree(file_data->table[i].files);
+ ctx->nr_user_files = 0;
+out_ref:
+ percpu_ref_exit(&file_data->refs);
+out_free:
+ kfree(file_data->table);
+ kfree(file_data);
+ ctx->file_data = NULL;
return ret;
}
@@ -7285,14 +7539,12 @@ static int io_queue_file_removal(struct fixed_file_data *data,
struct file *file)
{
struct io_file_put *pfile;
- struct percpu_ref *refs = data->cur_refs;
- struct fixed_file_ref_node *ref_node;
+ struct fixed_file_ref_node *ref_node = data->node;
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile)
return -ENOMEM;
- ref_node = container_of(refs, struct fixed_file_ref_node, refs);
pfile->file = file;
list_add(&pfile->list, &ref_node->file_list);
@@ -7375,10 +7627,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
}
if (needs_switch) {
- percpu_ref_kill(data->cur_refs);
+ percpu_ref_kill(&data->node->refs);
spin_lock(&data->lock);
list_add(&ref_node->node, &data->ref_list);
- data->cur_refs = &ref_node->refs;
+ data->node = ref_node;
spin_unlock(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
@@ -7459,20 +7711,76 @@ out_fput:
return ret;
}
-static int io_sq_offload_start(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static int io_uring_alloc_task_context(struct task_struct *task)
+{
+ struct io_uring_task *tctx;
+ int ret;
+
+ tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+ if (unlikely(!tctx))
+ return -ENOMEM;
+
+ ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
+ if (unlikely(ret)) {
+ kfree(tctx);
+ return ret;
+ }
+
+ xa_init(&tctx->xa);
+ init_waitqueue_head(&tctx->wait);
+ tctx->last = NULL;
+ tctx->in_idle = 0;
+ io_init_identity(&tctx->__identity);
+ tctx->identity = &tctx->__identity;
+ task->io_uring = tctx;
+ return 0;
+}
+
+void __io_uring_free(struct task_struct *tsk)
+{
+ struct io_uring_task *tctx = tsk->io_uring;
+
+ WARN_ON_ONCE(!xa_empty(&tctx->xa));
+ WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
+ if (tctx->identity != &tctx->__identity)
+ kfree(tctx->identity);
+ percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx);
+ tsk->io_uring = NULL;
+}
+
+static int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
int ret;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct io_sq_data *sqd;
+
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto err;
+ sqd = io_get_sq_data(p);
+ if (IS_ERR(sqd)) {
+ ret = PTR_ERR(sqd);
+ goto err;
+ }
+
+ ctx->sq_data = sqd;
+ io_sq_thread_park(sqd);
+ mutex_lock(&sqd->ctx_lock);
+ list_add(&ctx->sqd_list, &sqd->ctx_new_list);
+ mutex_unlock(&sqd->ctx_lock);
+ io_sq_thread_unpark(sqd);
+
ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
if (!ctx->sq_thread_idle)
ctx->sq_thread_idle = HZ;
+ if (sqd->thread)
+ goto done;
+
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu = p->sq_thread_cpu;
@@ -7482,25 +7790,27 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
if (!cpu_online(cpu))
goto err;
- ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
- ctx, cpu,
- "io_uring-sq");
+ sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
+ cpu, "io_uring-sq");
} else {
- ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+ sqd->thread = kthread_create(io_sq_thread, sqd,
"io_uring-sq");
}
- if (IS_ERR(ctx->sqo_thread)) {
- ret = PTR_ERR(ctx->sqo_thread);
- ctx->sqo_thread = NULL;
+ if (IS_ERR(sqd->thread)) {
+ ret = PTR_ERR(sqd->thread);
+ sqd->thread = NULL;
goto err;
}
- wake_up_process(ctx->sqo_thread);
+ ret = io_uring_alloc_task_context(sqd->thread);
+ if (ret)
+ goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
/* Can't have SQ_AFF without SQPOLL */
ret = -EINVAL;
goto err;
}
+done:
ret = io_init_wq_offload(ctx, p);
if (ret)
goto err;
@@ -7511,6 +7821,14 @@ err:
return ret;
}
+static void io_sq_offload_start(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
+ wake_up_process(sqd->thread);
+}
+
static inline void __io_unaccount_mem(struct user_struct *user,
unsigned long nr_pages)
{
@@ -7542,11 +7860,11 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
if (ctx->limit_mem)
__io_unaccount_mem(ctx->user, nr_pages);
- if (ctx->sqo_mm) {
+ if (ctx->mm_account) {
if (acct == ACCT_LOCKED)
- ctx->sqo_mm->locked_vm -= nr_pages;
+ ctx->mm_account->locked_vm -= nr_pages;
else if (acct == ACCT_PINNED)
- atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
+ atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
}
@@ -7561,11 +7879,11 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
return ret;
}
- if (ctx->sqo_mm) {
+ if (ctx->mm_account) {
if (acct == ACCT_LOCKED)
- ctx->sqo_mm->locked_vm += nr_pages;
+ ctx->mm_account->locked_vm += nr_pages;
else if (acct == ACCT_PINNED)
- atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
+ atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
}
return 0;
@@ -7645,7 +7963,8 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
for (j = 0; j < imu->nr_bvecs; j++)
unpin_user_page(imu->bvec[j].bv_page);
- io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
+ if (imu->acct_pages)
+ io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -7681,11 +8000,80 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
return 0;
}
+/*
+ * Not super efficient, but this only happens at registration time. And we do
+ * cache
+ * the last compound head, so generally we'll only do a full search if we don't
+ * match that one.
+ *
+ * We check if the given compound head page has already been accounted, to
+ * avoid double accounting it. This allows us to account the full size of the
+ * page, not just the constituent pages of a huge page.
+ */
+static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
+ int nr_pages, struct page *hpage)
+{
+ int i, j;
+
+ /* check current page array */
+ for (i = 0; i < nr_pages; i++) {
+ if (!PageCompound(pages[i]))
+ continue;
+ if (compound_head(pages[i]) == hpage)
+ return true;
+ }
+
+ /* check previously registered pages */
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+ for (j = 0; j < imu->nr_bvecs; j++) {
+ if (!PageCompound(imu->bvec[j].bv_page))
+ continue;
+ if (compound_head(imu->bvec[j].bv_page) == hpage)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
+ int nr_pages, struct io_mapped_ubuf *imu,
+ struct page **last_hpage)
+{
+ int i, ret;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (!PageCompound(pages[i])) {
+ imu->acct_pages++;
+ } else {
+ struct page *hpage;
+
+ hpage = compound_head(pages[i]);
+ if (hpage == *last_hpage)
+ continue;
+ *last_hpage = hpage;
+ if (headpage_already_acct(ctx, pages, i, hpage))
+ continue;
+ imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
+ }
+ }
+
+ if (!imu->acct_pages)
+ return 0;
+
+ ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
+ if (ret)
+ imu->acct_pages = 0;
+ return ret;
+}
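
As a worked example of the charge computed above (assumed numbers, 4K base pages): a buffer backed by one 2MB compound page is charged page_size(hpage) >> PAGE_SHIFT = 512 pages exactly once, keyed on its head page, while each ordinary page is charged 1. A hedged sketch of the per-page charge, ignoring the cross-buffer search:

	static unsigned long charge_for(struct page *page, struct page **last_hpage)
	{
		if (!PageCompound(page))
			return 1;			/* normal page: charge one */
		if (compound_head(page) == *last_hpage)
			return 0;			/* already charged via its head */
		*last_hpage = compound_head(page);
		/* e.g. 2MB huge page: (2 << 20) >> 12 == 512 */
		return page_size(compound_head(page)) >> PAGE_SHIFT;
	}
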
+
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
+ struct page *last_hpage = NULL;
int i, j, got_pages = 0;
int ret = -EINVAL;
@@ -7728,10 +8116,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
- ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
- if (ret)
- goto err;
-
ret = 0;
if (!pages || nr_pages > got_pages) {
kvfree(vmas);
@@ -7743,7 +8127,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
GFP_KERNEL);
if (!pages || !vmas) {
ret = -ENOMEM;
- io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
goto err;
}
got_pages = nr_pages;
@@ -7752,10 +8135,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
GFP_KERNEL);
ret = -ENOMEM;
- if (!imu->bvec) {
- io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
+ if (!imu->bvec)
goto err;
- }
ret = 0;
mmap_read_lock(current->mm);
@@ -7784,7 +8165,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (pret > 0)
unpin_user_pages(pages, pret);
- io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
+ kvfree(imu->bvec);
+ goto err;
+ }
+
+ ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
+ if (ret) {
+ unpin_user_pages(pages, pret);
kvfree(imu->bvec);
goto err;
}
@@ -7869,11 +8256,19 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
io_sqe_buffer_unregister(ctx);
- if (ctx->sqo_mm) {
- mmdrop(ctx->sqo_mm);
- ctx->sqo_mm = NULL;
+
+ if (ctx->sqo_task) {
+ put_task_struct(ctx->sqo_task);
+ ctx->sqo_task = NULL;
+ mmdrop(ctx->mm_account);
+ ctx->mm_account = NULL;
}
+#ifdef CONFIG_BLK_CGROUP
+ if (ctx->sqo_blkcg_css)
+ css_put(ctx->sqo_blkcg_css);
+#endif
+
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
@@ -7908,8 +8303,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
* io_commit_cqring
*/
smp_rmb();
- if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
- ctx->rings->sq_ring_entries)
+ if (!io_sqring_full(ctx))
mask |= EPOLLOUT | EPOLLWRNORM;
if (io_cqring_events(ctx, false))
mask |= EPOLLIN | EPOLLRDNORM;
@@ -7927,11 +8321,14 @@ static int io_uring_fasync(int fd, struct file *file, int on)
static int io_remove_personalities(int id, void *p, void *data)
{
struct io_ring_ctx *ctx = data;
- const struct cred *cred;
+ struct io_identity *iod;
- cred = idr_remove(&ctx->personality_idr, id);
- if (cred)
- put_cred(cred);
+ iod = idr_remove(&ctx->personality_idr, id);
+ if (iod) {
+ put_cred(iod->creds);
+ if (refcount_dec_and_test(&iod->count))
+ kfree(iod);
+ }
return 0;
}
@@ -7948,7 +8345,7 @@ static void io_ring_exit_work(struct work_struct *work)
*/
do {
if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
@@ -7960,15 +8357,15 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
percpu_ref_kill(&ctx->refs);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx);
- io_poll_remove_all(ctx);
+ io_kill_timeouts(ctx, NULL);
+ io_poll_remove_all(ctx, NULL);
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
@@ -8003,7 +8400,8 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data)
{
struct files_struct *files = data;
- return work->files == files;
+ return !files || ((work->flags & IO_WQ_WORK_FILES) &&
+ work->identity->files == files);
}
/*
@@ -8024,12 +8422,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
return false;
}
-static inline bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files;
-}
-
static bool io_match_link_files(struct io_kiocb *req,
struct files_struct *files)
{
@@ -8145,11 +8537,14 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
}
}
-static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+/*
+ * Returns true if we found and killed one or more files pinning requests
+ */
+static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
if (list_empty_careful(&ctx->inflight_list))
- return;
+ return false;
io_cancel_defer_files(ctx, files);
/* cancel all at once, should be faster than doing it one by one*/
@@ -8161,7 +8556,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (req->work.files != files)
+ if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
+ req->work.identity->files != files)
continue;
/* req is being completed, ignore */
if (!refcount_inc_not_zero(&req->refs))
@@ -8180,9 +8576,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
/* cancel this request, or head link requests */
io_attempt_cancel(ctx, cancel_req);
io_put_req(cancel_req);
+ /* cancellations _may_ trigger task work */
+ io_run_task_work();
schedule();
finish_wait(&ctx->inflight_wait, &wait);
}
+
+ return true;
}
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
@@ -8190,21 +8590,192 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct task_struct *task = data;
- return req->task == task;
+ return io_task_match(req, task);
+}
+
+static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ struct files_struct *files)
+{
+ bool ret;
+
+ ret = io_uring_cancel_files(ctx, files);
+ if (!files) {
+ enum io_wq_cancel cret;
+
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+ if (cret != IO_WQ_CANCEL_NOTFOUND)
+ ret = true;
+
+ /* SQPOLL thread does its own polling */
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ while (!list_empty_careful(&ctx->iopoll_list)) {
+ io_iopoll_try_reap_events(ctx);
+ ret = true;
+ }
+ }
+
+ ret |= io_poll_remove_all(ctx, task);
+ ret |= io_kill_timeouts(ctx, task);
+ }
+
+ return ret;
+}
+
+/*
+ * We need to iteratively cancel requests, in case a request has dependent
+ * hard links. These persist even when a cancelation fails, hence keep
+ * looping until none are found.
+ */
+static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct task_struct *task = current;
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data)
+ task = ctx->sq_data->thread;
+
+ io_cqring_overflow_flush(ctx, true, task, files);
+
+ while (__io_uring_cancel_task_requests(ctx, task, files)) {
+ io_run_task_work();
+ cond_resched();
+ }
+}
+
+/*
+ * Note that this task has used io_uring. We use it for cancelation purposes.
+ */
+static int io_uring_add_task_file(struct file *file)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (unlikely(!tctx)) {
+ int ret;
+
+ ret = io_uring_alloc_task_context(current);
+ if (unlikely(ret))
+ return ret;
+ tctx = current->io_uring;
+ }
+ if (tctx->last != file) {
+ void *old = xa_load(&tctx->xa, (unsigned long)file);
+
+ if (!old) {
+ get_file(file);
+ xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
+ }
+ tctx->last = file;
+ }
+
+ return 0;
+}
+
+/*
+ * Remove this io_uring_file -> task mapping.
+ */
+static void io_uring_del_task_file(struct file *file)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (tctx->last == file)
+ tctx->last = NULL;
+ file = xa_erase(&tctx->xa, (unsigned long)file);
+ if (file)
+ fput(file);
+}
+
+static void __io_uring_attempt_task_drop(struct file *file)
+{
+ struct file *old = xa_load(&current->io_uring->xa, (unsigned long)file);
+
+ if (old == file)
+ io_uring_del_task_file(file);
+}
+
+/*
+ * Drop task note for this file if we're the only ones that hold it after
+ * pending fput()
+ */
+static void io_uring_attempt_task_drop(struct file *file, bool exiting)
+{
+ if (!current->io_uring)
+ return;
+ /*
+ * fput() is pending; the count will be 2 if the only other ref is our
+ * potential task file note. If the task is exiting, drop regardless
+ * of the count.
+ */
+ if (!exiting && atomic_long_read(&file->f_count) != 2)
+ return;
+
+ __io_uring_attempt_task_drop(file);
+}
+
+void __io_uring_files_cancel(struct files_struct *files)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ struct file *file;
+ unsigned long index;
+
+ /* make sure overflow events are dropped */
+ tctx->in_idle = true;
+
+ xa_for_each(&tctx->xa, index, file) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ io_uring_cancel_task_requests(ctx, files);
+ if (files)
+ io_uring_del_task_file(file);
+ }
+}
+
+/*
+ * Find any io_uring fd that this task has registered or done IO on, and cancel
+ * requests.
+ */
+void __io_uring_task_cancel(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ DEFINE_WAIT(wait);
+ s64 inflight;
+
+ /* make sure overflow events are dropped */
+ tctx->in_idle = true;
+
+ do {
+ /* read completions before cancelations */
+ inflight = percpu_counter_sum(&tctx->inflight);
+ if (!inflight)
+ break;
+ __io_uring_files_cancel(NULL);
+
+ prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ /*
+ * If we've seen completions, retry. This avoids a race where
+ * a completion comes in before we did prepare_to_wait().
+ */
+ if (inflight != percpu_counter_sum(&tctx->inflight))
+ continue;
+ schedule();
+ } while (1);
+
+ finish_wait(&tctx->wait, &wait);
+ tctx->in_idle = false;
}
static int io_uring_flush(struct file *file, void *data)
{
struct io_ring_ctx *ctx = file->private_data;
- io_uring_cancel_files(ctx, data);
-
/*
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
+ data = NULL;
+ io_uring_cancel_task_requests(ctx, data);
+ io_uring_attempt_task_drop(file, !data);
return 0;
}
@@ -8278,6 +8849,25 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
#endif /* !CONFIG_MMU */
+static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
+{
+ DEFINE_WAIT(wait);
+
+ do {
+ if (!io_sqring_full(ctx))
+ break;
+
+ prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
+
+ if (!io_sqring_full(ctx))
+ break;
+
+ schedule();
+ } while (!signal_pending(current));
+
+ finish_wait(&ctx->sqo_sq_wait, &wait);
+}
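
A hedged userspace sketch of the flag this helper services: with SQPOLL, a submitter that finds the SQ ring full can ask the kernel to block until the poll thread frees an entry instead of busy-retrying. The raw-syscall wrapper is local to the example.

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int sq_wait_for_space(int ring_fd)
	{
		/* to_submit == 0 and min_complete == 0: only wait for SQ space */
		return syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			       IORING_ENTER_SQ_WAIT, NULL, 0);
	}
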
+
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const sigset_t __user *, sig,
size_t, sigsz)
@@ -8289,7 +8879,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
- if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+ if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
+ IORING_ENTER_SQ_WAIT))
return -EINVAL;
f = fdget(fd);
@@ -8305,6 +8896,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (!percpu_ref_tryget(&ctx->refs))
goto out_fput;
+ ret = -EBADFD;
+ if (ctx->flags & IORING_SETUP_R_DISABLED)
+ goto out;
+
/*
* For SQ polling, the thread will do all submissions and completions.
* Just return the requested submit count, and wake the thread if
@@ -8313,13 +8908,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
if (!list_empty_careful(&ctx->cq_overflow_list))
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx, false, NULL, NULL);
if (flags & IORING_ENTER_SQ_WAKEUP)
- wake_up(&ctx->sqo_wait);
+ wake_up(&ctx->sq_data->wait);
+ if (flags & IORING_ENTER_SQ_WAIT)
+ io_sqpoll_wait_sq(ctx);
submitted = to_submit;
} else if (to_submit) {
+ ret = io_uring_add_task_file(f.file);
+ if (unlikely(ret))
+ goto out;
mutex_lock(&ctx->uring_lock);
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
+ submitted = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)
@@ -8385,11 +8985,25 @@ static int io_uring_show_cred(int id, void *p, void *data)
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
+ struct io_sq_data *sq = NULL;
+ bool has_lock;
int i;
- mutex_lock(&ctx->uring_lock);
+ /*
+ * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
+ * since the fdinfo case grabs it in the opposite direction of normal use
+ * cases. If we fail to get the lock, we just don't iterate any
+ * structures that could be going away outside the io_uring mutex.
+ */
+ has_lock = mutex_trylock(&ctx->uring_lock);
+
+ if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+ sq = ctx->sq_data;
+
+ seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
+ seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
- for (i = 0; i < ctx->nr_user_files; i++) {
+ for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct fixed_file_table *table;
struct file *f;
@@ -8401,13 +9015,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "%5u: <none>\n", i);
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
- for (i = 0; i < ctx->nr_user_bufs; i++) {
+ for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
(unsigned int) buf->len);
}
- if (!idr_is_empty(&ctx->personality_idr)) {
+ if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
seq_printf(m, "Personalities:\n");
idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
}
@@ -8422,7 +9036,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
req->task->task_works != NULL);
}
spin_unlock_irq(&ctx->completion_lock);
- mutex_unlock(&ctx->uring_lock);
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
}
static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
@@ -8520,6 +9135,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC);
if (IS_ERR(file)) {
+err_fd:
put_unused_fd(ret);
ret = PTR_ERR(file);
goto err;
@@ -8528,6 +9144,10 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
#endif
+ if (unlikely(io_uring_add_task_file(file))) {
+ file = ERR_PTR(-ENOMEM);
+ goto err_fd;
+ }
fd_install(ret, file);
return ret;
err:
@@ -8604,9 +9224,39 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->compat = in_compat_syscall();
ctx->user = user;
ctx->creds = get_current_cred();
+#ifdef CONFIG_AUDIT
+ ctx->loginuid = current->loginuid;
+ ctx->sessionid = current->sessionid;
+#endif
+ ctx->sqo_task = get_task_struct(current);
+ /*
+ * This is just grabbed for accounting purposes. When a process exits,
+ * the mm is exited and dropped before the files, hence we need to hang
+ * on to this mm purely so we can unaccount memory (locked/pinned vm).
+ * It's not used for anything else.
+ */
mmgrab(current->mm);
- ctx->sqo_mm = current->mm;
+ ctx->mm_account = current->mm;
+
+#ifdef CONFIG_BLK_CGROUP
+ /*
+ * The sq thread will belong to the original cgroup it was created in.
+ * If the cgroup goes offline (e.g. disabling the io controller), then
+ * issued bios will be associated with the closest cgroup later in the
+ * block layer.
+ */
+ rcu_read_lock();
+ ctx->sqo_blkcg_css = blkcg_css();
+ ret = css_tryget_online(ctx->sqo_blkcg_css);
+ rcu_read_unlock();
+ if (!ret) {
+ /* don't init against a dying cgroup, have the user try again */
+ ctx->sqo_blkcg_css = NULL;
+ ret = -ENODEV;
+ goto err;
+ }
+#endif
/*
* Account memory _before_ installing the file descriptor. Once
@@ -8622,10 +9272,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
- ret = io_sq_offload_start(ctx, p);
+ ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
+ if (!(p->flags & IORING_SETUP_R_DISABLED))
+ io_sq_offload_start(ctx);
+
memset(&p->sq_off, 0, sizeof(p->sq_off));
p->sq_off.head = offsetof(struct io_rings, sq.head);
p->sq_off.tail = offsetof(struct io_rings, sq.tail);
@@ -8688,7 +9341,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
+ IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
+ IORING_SETUP_R_DISABLED))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -8741,29 +9395,124 @@ out:
static int io_register_personality(struct io_ring_ctx *ctx)
{
- const struct cred *creds = get_current_cred();
- int id;
+ struct io_identity *id;
+ int ret;
+
+ id = kmalloc(sizeof(*id), GFP_KERNEL);
+ if (unlikely(!id))
+ return -ENOMEM;
+
+ io_init_identity(id);
+ id->creds = get_current_cred();
- id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
- USHRT_MAX, GFP_KERNEL);
- if (id < 0)
- put_cred(creds);
- return id;
+ ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
+ if (ret < 0) {
+ put_cred(id->creds);
+ kfree(id);
+ }
+ return ret;
}
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
- const struct cred *old_creds;
+ struct io_identity *iod;
- old_creds = idr_remove(&ctx->personality_idr, id);
- if (old_creds) {
- put_cred(old_creds);
+ iod = idr_remove(&ctx->personality_idr, id);
+ if (iod) {
+ put_cred(iod->creds);
+ if (refcount_dec_and_test(&iod->count))
+ kfree(iod);
return 0;
}
return -EINVAL;
}
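
For reference, a hedged userspace sketch of the personality flow these two helpers back: register the calling task's creds, then stamp the returned id into an SQE so that one request runs with those creds. The wrappers are local helpers; error handling is elided.

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int register_personality(int ring_fd)
	{
		/* returns a personality id >= 1, or a negative errno */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_PERSONALITY, NULL, 0);
	}

	static void submit_as(struct io_uring_sqe *sqe, int id)
	{
		sqe->personality = id;	/* this request runs with the registered creds */
	}
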
+static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int nr_args)
+{
+ struct io_uring_restriction *res;
+ size_t size;
+ int i, ret;
+
+ /* Restrictions allowed only if rings started disabled */
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ /* We allow only a single restrictions registration */
+ if (ctx->restrictions.registered)
+ return -EBUSY;
+
+ if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
+ return -EINVAL;
+
+ size = array_size(nr_args, sizeof(*res));
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ res = memdup_user(arg, size);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ ret = 0;
+
+ for (i = 0; i < nr_args; i++) {
+ switch (res[i].opcode) {
+ case IORING_RESTRICTION_REGISTER_OP:
+ if (res[i].register_op >= IORING_REGISTER_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].register_op,
+ ctx->restrictions.register_op);
+ break;
+ case IORING_RESTRICTION_SQE_OP:
+ if (res[i].sqe_op >= IORING_OP_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
+ ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
+ ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+out:
+ /* Reset all restrictions if an error happened */
+ if (ret != 0)
+ memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
+ else
+ ctx->restrictions.registered = true;
+
+ kfree(res);
+ return ret;
+}
+
+static int io_register_enable_rings(struct io_ring_ctx *ctx)
+{
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ if (ctx->restrictions.registered)
+ ctx->restricted = 1;
+
+ ctx->flags &= ~IORING_SETUP_R_DISABLED;
+
+ io_sq_offload_start(ctx);
+
+ return 0;
+}
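
Putting the two registration paths together, a hedged userspace sketch of the intended sequence: create the ring disabled, register the restrictions while it is still disabled, then enable it. Constants come from the uapi header; error handling is elided.

	#include <linux/io_uring.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int setup_restricted_ring(void)
	{
		struct io_uring_params p;
		struct io_uring_restriction res[2];
		int fd;

		memset(&p, 0, sizeof(p));
		p.flags = IORING_SETUP_R_DISABLED;	/* ring starts disabled */
		fd = syscall(__NR_io_uring_setup, 64, &p);

		memset(res, 0, sizeof(res));
		res[0].opcode = IORING_RESTRICTION_SQE_OP;	/* allow only reads */
		res[0].sqe_op = IORING_OP_READ;
		res[1].opcode = IORING_RESTRICTION_REGISTER_OP;	/* and only "enable" */
		res[1].register_op = IORING_REGISTER_ENABLE_RINGS;
		syscall(__NR_io_uring_register, fd,
			IORING_REGISTER_RESTRICTIONS, res, 2);

		/* from here on only the whitelisted ops are accepted */
		syscall(__NR_io_uring_register, fd,
			IORING_REGISTER_ENABLE_RINGS, NULL, 0);
		return fd;
	}
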
+
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
@@ -8805,11 +9554,31 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
+ do {
+ ret = wait_for_completion_interruptible(&ctx->ref_comp);
+ if (!ret)
+ break;
+ ret = io_run_task_work_sig();
+ if (ret < 0)
+ break;
+ } while (1);
+
mutex_lock(&ctx->uring_lock);
+
if (ret) {
percpu_ref_resurrect(&ctx->refs);
- ret = -EINTR;
+ goto out_quiesce;
+ }
+ }
+
+ if (ctx->restricted) {
+ if (opcode >= IORING_REGISTER_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!test_bit(opcode, ctx->restrictions.register_op)) {
+ ret = -EACCES;
goto out;
}
}
@@ -8873,15 +9642,25 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_unregister_personality(ctx, nr_args);
break;
+ case IORING_REGISTER_ENABLE_RINGS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_enable_rings(ctx);
+ break;
+ case IORING_REGISTER_RESTRICTIONS:
+ ret = io_register_restrictions(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
+out:
if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
-out:
+out_quiesce:
reinit_completion(&ctx->ref_comp);
}
return ret;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index bcfc288dba3f..8180061b9e16 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -22,18 +22,25 @@
#include "../internal.h"
/*
- * Structure allocated for each page when block size < PAGE_SIZE to track
- * sub-page uptodate status and I/O completions.
+ * Structure allocated for each page or THP when block size < page size
+ * to track sub-page uptodate status and I/O completions.
*/
struct iomap_page {
- atomic_t read_count;
- atomic_t write_count;
+ atomic_t read_bytes_pending;
+ atomic_t write_bytes_pending;
spinlock_t uptodate_lock;
- DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+ unsigned long uptodate[];
};
static inline struct iomap_page *to_iomap_page(struct page *page)
{
+ /*
+ * Per-block data is stored in the head page. Callers should
+ * not be dealing with tail pages (and if they are, they can
+ * call thp_head() first).
+ */
+ VM_BUG_ON_PGFLAGS(PageTail(page), page);
+
if (page_has_private(page))
return (struct iomap_page *)page_private(page);
return NULL;
@@ -45,20 +52,16 @@ static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
struct iomap_page *iop = to_iomap_page(page);
+ unsigned int nr_blocks = i_blocks_per_page(inode, page);
- if (iop || i_blocksize(inode) == PAGE_SIZE)
+ if (iop || nr_blocks <= 1)
return iop;
- iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
- atomic_set(&iop->read_count, 0);
- atomic_set(&iop->write_count, 0);
+ iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
+ GFP_NOFS | __GFP_NOFAIL);
spin_lock_init(&iop->uptodate_lock);
- bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
-
- /*
- * migrate_page_move_mapping() assumes that pages with private data have
- * their count elevated by 1.
- */
+ if (PageUptodate(page))
+ bitmap_fill(iop->uptodate, nr_blocks);
attach_page_private(page, iop);
return iop;
}
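
To make the sizing concrete (assumed numbers, not from the patch): a 2MB THP with 4K blocks has nr_blocks = 512, so the uptodate bitmap needs BITS_TO_LONGS(512) = 8 longs on 64-bit, all folded into the one kzalloc() by struct_size(). A hedged sketch of the equivalent arithmetic:

	static size_t iop_bytes(unsigned int nr_blocks)
	{
		/* what struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks))
		 * computes, minus its overflow checking */
		return sizeof(struct iomap_page) +
		       BITS_TO_LONGS(nr_blocks) * sizeof(unsigned long);
	}
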
@@ -67,11 +70,14 @@ static void
iomap_page_release(struct page *page)
{
struct iomap_page *iop = detach_page_private(page);
+ unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
if (!iop)
return;
- WARN_ON_ONCE(atomic_read(&iop->read_count));
- WARN_ON_ONCE(atomic_read(&iop->write_count));
+ WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
+ WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
+ WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
+ PageUptodate(page));
kfree(iop);
}
@@ -142,19 +148,11 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
struct inode *inode = page->mapping->host;
unsigned first = off >> inode->i_blkbits;
unsigned last = (off + len - 1) >> inode->i_blkbits;
- bool uptodate = true;
unsigned long flags;
- unsigned int i;
spin_lock_irqsave(&iop->uptodate_lock, flags);
- for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
- if (i >= first && i <= last)
- set_bit(i, iop->uptodate);
- else if (!test_bit(i, iop->uptodate))
- uptodate = false;
- }
-
- if (uptodate)
+ bitmap_set(iop->uptodate, first, last - first + 1);
+ if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
SetPageUptodate(page);
spin_unlock_irqrestore(&iop->uptodate_lock, flags);
}
@@ -172,13 +170,6 @@ iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
}
static void
-iomap_read_finish(struct iomap_page *iop, struct page *page)
-{
- if (!iop || atomic_dec_and_test(&iop->read_count))
- unlock_page(page);
-}
-
-static void
iomap_read_page_end_io(struct bio_vec *bvec, int error)
{
struct page *page = bvec->bv_page;
@@ -191,7 +182,8 @@ iomap_read_page_end_io(struct bio_vec *bvec, int error)
iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
}
- iomap_read_finish(iop, page);
+ if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending))
+ unlock_page(page);
}
static void
@@ -271,30 +263,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
ctx->cur_page_in_bio = true;
+ if (iop)
+ atomic_add(plen, &iop->read_bytes_pending);
- /*
- * Try to merge into a previous segment if we can.
- */
+ /* Try to merge into a previous segment if we can */
sector = iomap_sector(iomap, pos);
- if (ctx->bio && bio_end_sector(ctx->bio) == sector)
+ if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff,
+ &same_page))
+ goto done;
is_contig = true;
-
- if (is_contig &&
- __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) {
- if (!same_page && iop)
- atomic_inc(&iop->read_count);
- goto done;
}
- /*
- * If we start a new segment we need to increase the read count, and we
- * need to do so before submitting any previous full bio to make sure
- * that we don't prematurely unlock the page.
- */
- if (iop)
- atomic_inc(&iop->read_count);
-
- if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
+ if (!is_contig || bio_full(ctx->bio, plen)) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
gfp_t orig_gfp = gfp;
int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -571,13 +552,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
{
struct iomap_page *iop = iomap_page_create(inode, page);
loff_t block_size = i_blocksize(inode);
- loff_t block_start = pos & ~(block_size - 1);
- loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+ loff_t block_start = round_down(pos, block_size);
+ loff_t block_end = round_up(pos + len, block_size);
unsigned from = offset_in_page(pos), to = from + len, poff, plen;
- int status;
if (PageUptodate(page))
return 0;
+ ClearPageError(page);
do {
iomap_adjust_read_range(inode, iop, &block_start,
@@ -594,14 +575,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
return -EIO;
zero_user_segments(page, poff, from, to, poff + plen);
- iomap_set_range_uptodate(page, poff, plen);
- continue;
+ } else {
+ int status = iomap_read_page_sync(block_start, page,
+ poff, plen, srcmap);
+ if (status)
+ return status;
}
-
- status = iomap_read_page_sync(block_start, page, poff, plen,
- srcmap);
- if (status)
- return status;
+ iomap_set_range_uptodate(page, poff, plen);
} while ((block_start += plen) < block_end);
return 0;
@@ -685,9 +665,8 @@ iomap_set_page_dirty(struct page *page)
}
EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
-static int
-__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page)
+static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+ size_t copied, struct page *page)
{
flush_dcache_page(page);
@@ -709,15 +688,15 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
return copied;
}
-static int
-iomap_write_end_inline(struct inode *inode, struct page *page,
- struct iomap *iomap, loff_t pos, unsigned copied)
+static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
+ struct iomap *iomap, loff_t pos, size_t copied)
{
void *addr;
WARN_ON_ONCE(!PageUptodate(page));
BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ flush_dcache_page(page);
addr = kmap_atomic(page);
memcpy(iomap->inline_data + pos, addr + pos, copied);
kunmap_atomic(addr);
@@ -726,13 +705,14 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
return copied;
}
-static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
- struct page *page, struct iomap *iomap, struct iomap *srcmap)
+/* Returns the number of bytes copied. May be 0. Cannot be an errno. */
+static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+ size_t copied, struct page *page, struct iomap *iomap,
+ struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
loff_t old_size = inode->i_size;
- int ret;
+ size_t ret;
if (srcmap->type == IOMAP_INLINE) {
ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
@@ -811,13 +791,8 @@ again:
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- flush_dcache_page(page);
-
- status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+ copied = iomap_write_end(inode, pos, bytes, copied, page, iomap,
srcmap);
- if (unlikely(status < 0))
- break;
- copied = status;
cond_resched();
@@ -891,11 +866,8 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
srcmap);
- if (unlikely(status <= 0)) {
- if (WARN_ON_ONCE(status == 0))
- return -EIO;
- return status;
- }
+ if (WARN_ON_ONCE(status == 0))
+ return -EIO;
cond_resched();
@@ -928,11 +900,13 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
-static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
- unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
+static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length,
+ struct iomap *iomap, struct iomap *srcmap)
{
struct page *page;
int status;
+ unsigned offset = offset_in_page(pos);
+ unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
if (status)
@@ -944,38 +918,33 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
-static loff_t
-iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
- void *data, struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
+ loff_t length, void *data, struct iomap *iomap,
+ struct iomap *srcmap)
{
bool *did_zero = data;
loff_t written = 0;
- int status;
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return count;
+ return length;
do {
- unsigned offset, bytes;
-
- offset = offset_in_page(pos);
- bytes = min_t(loff_t, PAGE_SIZE - offset, count);
+ s64 bytes;
if (IS_DAX(inode))
- status = dax_iomap_zero(pos, offset, bytes, iomap);
+ bytes = dax_iomap_zero(pos, length, iomap);
else
- status = iomap_zero(inode, pos, offset, bytes, iomap,
- srcmap);
- if (status < 0)
- return status;
+ bytes = iomap_zero(inode, pos, length, iomap, srcmap);
+ if (bytes < 0)
+ return bytes;
pos += bytes;
- count -= bytes;
+ length -= bytes;
written += bytes;
if (did_zero)
*did_zero = true;
- } while (count > 0);
+ } while (length > 0);
return written;
}
@@ -1070,7 +1039,7 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void
iomap_finish_page_writeback(struct inode *inode, struct page *page,
- int error)
+ int error, unsigned int len)
{
struct iomap_page *iop = to_iomap_page(page);
@@ -1079,10 +1048,10 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page,
mapping_set_error(inode->i_mapping, -EIO);
}
- WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);
+ WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
- if (!iop || atomic_dec_and_test(&iop->write_count))
+ if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
end_page_writeback(page);
}
@@ -1116,7 +1085,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
/* walk each page on bio, ending page IO on them */
bio_for_each_segment_all(bv, bio, iter_all)
- iomap_finish_page_writeback(inode, bv->bv_page, error);
+ iomap_finish_page_writeback(inode, bv->bv_page, error,
+ bv->bv_len);
bio_put(bio);
}
/* The ioend has been freed by bio_put() */
@@ -1332,8 +1302,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
&same_page);
- if (iop && !same_page)
- atomic_inc(&iop->write_count);
+ if (iop)
+ atomic_add(len, &iop->write_bytes_pending);
if (!merged) {
if (bio_full(wpc->ioend->io_bio, len)) {
@@ -1375,8 +1345,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
int error = 0, count = 0, i;
LIST_HEAD(submit_list);
- WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);
+ WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
/*
* Walk through the page to find areas to write back. If we run off the
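[Editor's note] The hunks above convert iomap's per-page writeback tracking from a bio segment count (write_count) to a byte count (write_bytes_pending), so completion accounting stays correct however contiguous ranges get merged into bios. A minimal standalone sketch of the pattern, with hypothetical names:

	#include <linux/atomic.h>

	struct page_io_state {
		atomic_t bytes_pending;	/* bytes submitted, not yet completed */
	};

	static void io_submit_range(struct page_io_state *st, unsigned int len)
	{
		atomic_add(len, &st->bytes_pending);
	}

	/* True when the final outstanding byte range completes. */
	static bool io_complete_range(struct page_io_state *st, unsigned int len)
	{
		return atomic_sub_and_test(len, &st->bytes_pending);
	}

End of page writeback is signalled only once every submitted byte has completed, which is why iomap_finish_page_writeback() now also takes the completed bv_len.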
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index c1aafb2ab990..933f234d5bec 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -76,7 +76,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
dio->submit.cookie = submit_bio(bio);
}
-static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
const struct iomap_dio_ops *dops = dio->dops;
struct kiocb *iocb = dio->iocb;
@@ -108,7 +108,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
- if (!dio->error &&
+ if (!dio->error && dio->size &&
(dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
int err;
err = invalidate_inode_pages2_range(inode->i_mapping,
@@ -118,6 +118,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
dio_warn_stale_pagecache(iocb->ki_filp);
}
+ inode_dio_end(file_inode(iocb->ki_filp));
/*
* If this is a DSYNC write, make sure we push it to stable storage now
* that we've written data.
@@ -125,11 +126,11 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
ret = generic_write_sync(iocb, ret);
- inode_dio_end(file_inode(iocb->ki_filp));
kfree(dio);
return ret;
}
+EXPORT_SYMBOL_GPL(iomap_dio_complete);
static void iomap_dio_complete_work(struct work_struct *work)
{
@@ -388,6 +389,16 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
case IOMAP_INLINE:
return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
+ case IOMAP_DELALLOC:
+ /*
+ * DIO is not serialised against mmap() access at all, and so
+ * if the page_mkwrite occurs between the writeback and the
+ * iomap_apply() call in the DIO path, then it will see the
+ * DELALLOC block that the page-mkwrite allocated.
+ */
+ pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
+ dio->iocb->ki_filp, current->comm);
+ return -EIO;
default:
WARN_ON_ONCE(1);
return -EIO;
@@ -406,8 +417,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
* Returns -ENOTBLK in case of a page invalidation failure for
* writes. The caller needs to fall back to buffered I/O in this case.
*/
-ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+struct iomap_dio *
+__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion)
{
@@ -421,14 +432,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
struct iomap_dio *dio;
if (!count)
- return 0;
+ return NULL;
if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
- return -EIO;
+ return ERR_PTR(-EIO);
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
dio->iocb = iocb;
atomic_set(&dio->ref, 1);
@@ -558,7 +569,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
if (!wait_for_completion)
- return -EIOCBQUEUED;
+ return ERR_PTR(-EIOCBQUEUED);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -574,10 +585,26 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
__set_current_state(TASK_RUNNING);
}
- return iomap_dio_complete(dio);
+ return dio;
out_free_dio:
kfree(dio);
- return ret;
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__iomap_dio_rw);
+
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+ bool wait_for_completion)
+{
+ struct iomap_dio *dio;
+
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
+ if (IS_ERR_OR_NULL(dio))
+ return PTR_ERR_OR_ZERO(dio);
+ return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
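[Editor's note] Splitting iomap_dio_rw() into __iomap_dio_rw() plus iomap_dio_complete() lets a filesystem hold on to the struct iomap_dio and finish it from its own context. A hedged usage sketch (my_iomap_ops and the surrounding code are placeholders, not part of the exported API):

	/* Hypothetical caller of the split direct I/O API. */
	static ssize_t my_dio_read(struct kiocb *iocb, struct iov_iter *to)
	{
		struct iomap_dio *dio;

		dio = __iomap_dio_rw(iocb, to, &my_iomap_ops, NULL,
				     is_sync_kiocb(iocb));
		if (IS_ERR_OR_NULL(dio))
			return PTR_ERR_OR_ZERO(dio);
		/* ... filesystem-specific work before completion ... */
		return iomap_dio_complete(dio);
	}

Note the three-way contract: NULL for a zero-byte request, ERR_PTR() on failure (including -EIOCBQUEUED once an async request is in flight), and a live dio that the caller must complete.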
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index a2f5338a5ea1..176580f54af9 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -473,7 +473,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
struct inode *inode = page->mapping->host;
struct bio *bio = NULL;
int block_offset;
- int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+ int blocks_per_page = i_blocks_per_page(inode, page);
sector_t page_start; /* address of page in fs blocks */
sector_t pblock;
int xlen;
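[Editor's note] jfs now derives blocks-per-page from the i_blocks_per_page() helper rather than open-coding the shift; conceptually the helper is just the page size scaled by the inode's block-size bits, roughly as sketched below (see the pagemap helper introduced earlier in this series for the authoritative definition):

	/* Conceptual sketch of the helper; not the canonical definition. */
	static inline unsigned int i_blocks_per_page(struct inode *inode,
						     struct page *page)
	{
		return thp_size(page) >> inode->i_blkbits;
	}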
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
new file mode 100644
index 000000000000..90d255fbdd9b
--- /dev/null
+++ b/fs/kernel_read_file.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/kernel_read_file.h>
+#include <linux/security.h>
+#include <linux/vmalloc.h>
+
+/**
+ * kernel_read_file() - read file contents into a kernel buffer
+ *
+ * @file: file to read from
+ * @offset: where to start reading from (see below).
+ * @buf: pointer to a "void *" buffer for reading into (if
+ * *@buf is NULL, a buffer will be allocated, and
+ * @buf_size will be ignored)
+ * @buf_size: size of buf, if already allocated. If @buf not
+ * allocated, this is the largest size to allocate.
+ * @file_size: if non-NULL, the full size of @file will be
+ * written here.
+ * @id: the kernel_read_file_id identifying the type of
+ * file contents being read (for LSMs to examine)
+ *
+ * @offset must be 0 unless both @buf and @file_size are non-NULL
+ * (i.e. the caller must be expecting to read partial file contents
+ * via an already-allocated @buf, in at most @buf_size chunks, and
+ * will be able to determine when the entire file was read by
+ * checking @file_size). This isn't a recommended way to read a
+ * file, though, since it is possible that the contents might
+ * change between calls to kernel_read_file().
+ *
+ * Returns number of bytes read (no single read will be bigger
+ * than INT_MAX), or negative on error.
+ *
+ */
+int kernel_read_file(struct file *file, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
+{
+ loff_t i_size, pos;
+ size_t copied;
+ void *allocated = NULL;
+ bool whole_file;
+ int ret;
+
+ if (offset != 0 && (!*buf || !file_size))
+ return -EINVAL;
+
+ if (!S_ISREG(file_inode(file)->i_mode))
+ return -EINVAL;
+
+ ret = deny_write_access(file);
+ if (ret)
+ return ret;
+
+ i_size = i_size_read(file_inode(file));
+ if (i_size <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* The file is too big for sane activities. */
+ if (i_size > INT_MAX) {
+ ret = -EFBIG;
+ goto out;
+ }
+ /* The entire file cannot be read in one buffer. */
+ if (!file_size && offset == 0 && i_size > buf_size) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ whole_file = (offset == 0 && i_size <= buf_size);
+ ret = security_kernel_read_file(file, id, whole_file);
+ if (ret)
+ goto out;
+
+ if (file_size)
+ *file_size = i_size;
+
+ if (!*buf)
+ *buf = allocated = vmalloc(i_size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pos = offset;
+ copied = 0;
+ while (copied < buf_size) {
+ ssize_t bytes;
+ size_t wanted = min_t(size_t, buf_size - copied,
+ i_size - pos);
+
+ bytes = kernel_read(file, *buf + copied, wanted, &pos);
+ if (bytes < 0) {
+ ret = bytes;
+ goto out_free;
+ }
+
+ if (bytes == 0)
+ break;
+ copied += bytes;
+ }
+
+ if (whole_file) {
+ if (pos != i_size) {
+ ret = -EIO;
+ goto out_free;
+ }
+
+ ret = security_kernel_post_read_file(file, *buf, i_size, id);
+ }
+
+out_free:
+ if (ret < 0) {
+ if (allocated) {
+ vfree(*buf);
+ *buf = NULL;
+ }
+ }
+
+out:
+ allow_write_access(file);
+ return ret == 0 ? copied : ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file);
+
+int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
+{
+ struct file *file;
+ int ret;
+
+ if (!path || !*path)
+ return -EINVAL;
+
+ file = filp_open(path, O_RDONLY, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = kernel_read_file(file, offset, buf, buf_size, file_size, id);
+ fput(file);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
+
+int kernel_read_file_from_path_initns(const char *path, loff_t offset,
+ void **buf, size_t buf_size,
+ size_t *file_size,
+ enum kernel_read_file_id id)
+{
+ struct file *file;
+ struct path root;
+ int ret;
+
+ if (!path || !*path)
+ return -EINVAL;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+
+ file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0);
+ path_put(&root);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = kernel_read_file(file, offset, buf, buf_size, file_size, id);
+ fput(file);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
+
+int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
+{
+ struct fd f = fdget(fd);
+ int ret = -EBADF;
+
+ if (!f.file)
+ goto out;
+
+ ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id);
+out:
+ fdput(f);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
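[Editor's note] A hedged example of the common whole-file pattern with this API: pass *buf == NULL so kernel_read_file() vmalloc()s a buffer sized to the file, then vfree() it when done (READING_FIRMWARE stands in for whatever kernel_read_file_id the caller really has):

	#include <linux/kernel_read_file.h>
	#include <linux/vmalloc.h>

	static int load_blob(struct file *file)
	{
		void *buf = NULL;	/* let kernel_read_file() allocate */
		size_t file_size;
		int len;

		len = kernel_read_file(file, 0, &buf, INT_MAX, &file_size,
				       READING_FIRMWARE);
		if (len < 0)
			return len;
		/* ... consume buf[0..len) ... */
		vfree(buf);
		return 0;
	}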
diff --git a/fs/libfs.c b/fs/libfs.c
index e0d42e977d9a..fc34361c1489 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,6 +20,8 @@
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
+#include <linux/unicode.h>
+#include <linux/fscrypt.h>
#include <linux/uaccess.h>
@@ -1363,3 +1365,88 @@ bool is_empty_dir_inode(struct inode *inode)
return (inode->i_fop == &empty_dir_operations) &&
(inode->i_op == &empty_dir_inode_operations);
}
+
+#ifdef CONFIG_UNICODE
+/*
+ * Determine if the name of a dentry should be casefolded.
+ *
+ * Return: if names will need casefolding
+ */
+static bool needs_casefold(const struct inode *dir)
+{
+ return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding;
+}
+
+/**
+ * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
+ * @dentry: dentry whose name we are checking against
+ * @len: len of name of dentry
+ * @str: str pointer to name of dentry
+ * @name: Name to compare against
+ *
+ * Return: 0 if names match, 1 if mismatch, or -ERRNO
+ */
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
+{
+ const struct dentry *parent = READ_ONCE(dentry->d_parent);
+ const struct inode *dir = READ_ONCE(parent->d_inode);
+ const struct super_block *sb = dentry->d_sb;
+ const struct unicode_map *um = sb->s_encoding;
+ struct qstr qstr = QSTR_INIT(str, len);
+ char strbuf[DNAME_INLINE_LEN];
+ int ret;
+
+ if (!dir || !needs_casefold(dir))
+ goto fallback;
+ /*
+ * If the dentry name is stored in-line, then it may be concurrently
+ * modified by a rename. If this happens, the VFS will eventually retry
+ * the lookup, so it doesn't matter what ->d_compare() returns.
+ * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * string. Therefore, we have to copy the name into a temporary buffer.
+ */
+ if (len <= DNAME_INLINE_LEN - 1) {
+ memcpy(strbuf, str, len);
+ strbuf[len] = 0;
+ qstr.name = strbuf;
+ /* prevent compiler from optimizing out the temporary buffer */
+ barrier();
+ }
+ ret = utf8_strncasecmp(um, name, &qstr);
+ if (ret >= 0)
+ return ret;
+
+ if (sb_has_strict_encoding(sb))
+ return -EINVAL;
+fallback:
+ if (len != name->len)
+ return 1;
+ return !!memcmp(str, name->name, len);
+}
+EXPORT_SYMBOL(generic_ci_d_compare);
+
+/**
+ * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
+ * @dentry: dentry of the parent directory
+ * @str: qstr of name whose hash we should fill in
+ *
+ * Return: 0 if hash was successful or unchanged, and -EINVAL on error
+ */
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+{
+ const struct inode *dir = READ_ONCE(dentry->d_inode);
+ struct super_block *sb = dentry->d_sb;
+ const struct unicode_map *um = sb->s_encoding;
+ int ret = 0;
+
+ if (!dir || !needs_casefold(dir))
+ return 0;
+
+ ret = utf8_casefold_hash(um, dentry, str);
+ if (ret < 0 && sb_has_strict_encoding(sb))
+ return -EINVAL;
+ return 0;
+}
+EXPORT_SYMBOL(generic_ci_d_hash);
+#endif
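[Editor's note] Filesystems opt in to these helpers by pointing their dentry_operations at them once the superblock carries an encoding; a minimal sketch (the structure and function names are illustrative, not taken from any particular filesystem):

	#ifdef CONFIG_UNICODE
	static const struct dentry_operations my_ci_dentry_ops = {
		.d_hash		= generic_ci_d_hash,
		.d_compare	= generic_ci_d_compare,
	};

	static void my_set_d_ops(struct super_block *sb, struct dentry *dentry)
	{
		if (sb->s_encoding)
			d_set_d_op(dentry, &my_ci_dentry_ops);
	}
	#endif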
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1eabd91870e6..1d9488cf0534 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -417,7 +417,7 @@ void nsm_release(struct nsm_handle *nsm)
/*
* XDR functions for NSM.
*
- * See http://www.opengroup.org/ for details on the Network
+ * See https://www.opengroup.org/ for details on the Network
* Status Monitor wire protocol.
*/
diff --git a/fs/namei.c b/fs/namei.c
index e99e2a9da0f7..f1eb8ccd2be9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -568,8 +568,8 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
struct super_block *sb = mnt->mnt_sb;
- /* Bind mounts and multi-root filesystems can have disconnected paths */
- if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
+ /* Bind mounts can have disconnected paths */
+ if (mnt->mnt_root == sb->s_root)
return true;
return is_subdir(dentry, mnt->mnt_root);
diff --git a/fs/namespace.c b/fs/namespace.c
index bae0e95b3713..294e05a13d17 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3072,10 +3072,10 @@ static void shrink_submounts(struct mount *mnt)
}
}
-void *copy_mount_options(const void __user * data)
+static void *copy_mount_options(const void __user * data)
{
char *copy;
- unsigned size;
+ unsigned left, offset;
if (!data)
return NULL;
@@ -3084,20 +3084,31 @@ void *copy_mount_options(const void __user * data)
if (!copy)
return ERR_PTR(-ENOMEM);
- size = PAGE_SIZE - offset_in_page(data);
+ left = copy_from_user(copy, data, PAGE_SIZE);
- if (copy_from_user(copy, data, size)) {
+ /*
+ * Not all architectures have an exact copy_from_user(). Resort to
+ * byte at a time.
+ */
+ offset = PAGE_SIZE - left;
+ while (left) {
+ char c;
+ if (get_user(c, (const char __user *)data + offset))
+ break;
+ copy[offset] = c;
+ left--;
+ offset++;
+ }
+
+ if (left == PAGE_SIZE) {
kfree(copy);
return ERR_PTR(-EFAULT);
}
- if (size != PAGE_SIZE) {
- if (copy_from_user(copy + size, data + size, PAGE_SIZE - size))
- memset(copy + size, 0, PAGE_SIZE - size);
- }
+
return copy;
}
-char *copy_mount_string(const void __user *data)
+static char *copy_mount_string(const void __user *data)
{
return data ? strndup_user(data, PATH_MAX) : NULL;
}
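[Editor's note] The rewritten copy_mount_options() leans on copy_from_user() returning the number of bytes it could not copy; since some architectures over-report when the range crosses into an unmapped page, the tail is retried one byte at a time with get_user(). The same pattern in isolation, under a hypothetical name:

	/* Sketch of the "exact tail" pattern used above. */
	static size_t copy_from_user_exact_tail(char *dst,
						const char __user *src,
						size_t n)
	{
		size_t left = copy_from_user(dst, src, n);
		size_t offset = n - left;

		while (left) {		/* creep forward to the real fault */
			char c;

			if (get_user(c, src + offset))
				break;
			dst[offset++] = c;
			left--;
		}
		return left;		/* bytes that remain uncopied */
	}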
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e732580fe47b..cb52db9a0cfb 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -579,6 +579,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
do {
+ if (entry->label)
+ entry->label->len = NFS4_MAXLABELLEN;
+
status = xdr_decode(desc, entry, &stream);
if (status != 0) {
if (status == -EAGAIN)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ff8965d1a4d4..a163533446fa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -715,7 +715,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
}
static void
-ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -724,7 +724,7 @@ ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
}
static void
-ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -734,14 +734,14 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx,
+ u32 start_idx, u32 *best_idx,
bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
bool fail_return = false;
- int idx;
+ u32 idx;
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
@@ -766,21 +766,21 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
struct nfs4_pnfs_ds *ds;
@@ -791,7 +791,8 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
}
static struct nfs4_pnfs_ds *
-ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx)
+ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ u32 *best_idx)
{
struct pnfs_layout_segment *lseg = pgio->pg_lseg;
struct nfs4_pnfs_ds *ds;
@@ -837,7 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_pgio_mirror *pgm;
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
- int ds_idx;
+ u32 ds_idx, i;
retry:
ff_layout_pg_check_layout(pgio, req);
@@ -863,14 +864,14 @@ retry:
goto retry;
}
- mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+ for (i = 0; i < pgio->pg_mirror_count; i++) {
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ pgm = &pgio->pg_mirrors[i];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+ }
pgio->pg_mirror_idx = ds_idx;
- /* read always uses only one mirror - idx 0 for pgio layer */
- pgm = &pgio->pg_mirrors[0];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
-
if (NFS_SERVER(pgio->pg_inode)->flags &
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
pgio->pg_maxretrans = io_maxretrans;
@@ -894,7 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
struct nfs4_pnfs_ds *ds;
- int i;
+ u32 i;
retry:
ff_layout_pg_check_layout(pgio, req);
@@ -1038,7 +1039,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
- int new_idx = 0;
+ u32 new_idx = 0;
if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx))
ff_layout_send_layouterror(hdr->lseg);
@@ -1075,7 +1076,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
struct pnfs_layout_hdr *lo = lseg->pls_layout;
struct inode *inode = lo->plh_inode;
@@ -1149,7 +1150,7 @@ reset:
/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1184,7 +1185,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
@@ -1211,7 +1212,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
}
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
- int idx, u64 offset, u64 length,
+ u32 idx, u64 offset, u64 length,
u32 *op_status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
@@ -1809,7 +1810,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
loff_t offset = hdr->args.offset;
int vers;
struct nfs_fh *fh;
- int idx = hdr->pgio_mirror_idx;
+ u32 idx = hdr->pgio_mirror_idx;
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 524812984e2d..29ec8b09a52d 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -94,6 +94,7 @@ enum {
static const struct constant_table nfs_param_enums_local_lock[] = {
{ "all", Opt_local_lock_all },
{ "flock", Opt_local_lock_flock },
+ { "posix", Opt_local_lock_posix },
{ "none", Opt_local_lock_none },
{}
};
@@ -1039,6 +1040,65 @@ out_invalid_fh:
}
#if IS_ENABLED(CONFIG_NFS_V4)
+struct compat_nfs_string {
+ compat_uint_t len;
+ compat_uptr_t data;
+};
+
+static inline void compat_nfs_string(struct nfs_string *dst,
+ struct compat_nfs_string *src)
+{
+ dst->data = compat_ptr(src->data);
+ dst->len = src->len;
+}
+
+struct compat_nfs4_mount_data_v1 {
+ compat_int_t version;
+ compat_int_t flags;
+ compat_int_t rsize;
+ compat_int_t wsize;
+ compat_int_t timeo;
+ compat_int_t retrans;
+ compat_int_t acregmin;
+ compat_int_t acregmax;
+ compat_int_t acdirmin;
+ compat_int_t acdirmax;
+ struct compat_nfs_string client_addr;
+ struct compat_nfs_string mnt_path;
+ struct compat_nfs_string hostname;
+ compat_uint_t host_addrlen;
+ compat_uptr_t host_addr;
+ compat_int_t proto;
+ compat_int_t auth_flavourlen;
+ compat_uptr_t auth_flavours;
+};
+
+static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data)
+{
+ struct compat_nfs4_mount_data_v1 *compat =
+ (struct compat_nfs4_mount_data_v1 *)data;
+
+ /* copy the fields backwards */
+ data->auth_flavours = compat_ptr(compat->auth_flavours);
+ data->auth_flavourlen = compat->auth_flavourlen;
+ data->proto = compat->proto;
+ data->host_addr = compat_ptr(compat->host_addr);
+ data->host_addrlen = compat->host_addrlen;
+ compat_nfs_string(&data->hostname, &compat->hostname);
+ compat_nfs_string(&data->mnt_path, &compat->mnt_path);
+ compat_nfs_string(&data->client_addr, &compat->client_addr);
+ data->acdirmax = compat->acdirmax;
+ data->acdirmin = compat->acdirmin;
+ data->acregmax = compat->acregmax;
+ data->acregmin = compat->acregmin;
+ data->retrans = compat->retrans;
+ data->timeo = compat->timeo;
+ data->wsize = compat->wsize;
+ data->rsize = compat->rsize;
+ data->flags = compat->flags;
+ data->version = compat->version;
+}
+
/*
* Validate NFSv4 mount options
*/
@@ -1049,89 +1109,83 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
char *c;
- if (data == NULL)
- goto out_no_data;
+ if (!data) {
+ if (is_remount_fc(fc))
+ goto done;
+ return nfs_invalf(fc,
+ "NFS4: mount program didn't pass any mount data");
+ }
ctx->version = 4;
- switch (data->version) {
- case 1:
- if (data->host_addrlen > sizeof(ctx->nfs_server.address))
- goto out_no_address;
- if (data->host_addrlen == 0)
- goto out_no_address;
- ctx->nfs_server.addrlen = data->host_addrlen;
- if (copy_from_user(sap, data->host_addr, data->host_addrlen))
- return -EFAULT;
- if (!nfs_verify_server_address(sap))
- goto out_no_address;
- ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
-
- if (data->auth_flavourlen) {
- rpc_authflavor_t pseudoflavor;
- if (data->auth_flavourlen > 1)
- goto out_inval_auth;
- if (copy_from_user(&pseudoflavor,
- data->auth_flavours,
- sizeof(pseudoflavor)))
- return -EFAULT;
- ctx->selected_flavor = pseudoflavor;
- } else
- ctx->selected_flavor = RPC_AUTH_UNIX;
-
- c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->nfs_server.hostname = c;
+ if (data->version != 1)
+ return generic_parse_monolithic(fc, data);
- c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->nfs_server.export_path = c;
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+ if (in_compat_syscall())
+ nfs4_compat_mount_data_conv(data);
- c = strndup_user(data->client_addr.data, 16);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->client_address = c;
-
- /*
- * Translate to nfs_fs_context, which nfs_fill_super
- * can deal with.
- */
+ if (data->host_addrlen > sizeof(ctx->nfs_server.address))
+ goto out_no_address;
+ if (data->host_addrlen == 0)
+ goto out_no_address;
+ ctx->nfs_server.addrlen = data->host_addrlen;
+ if (copy_from_user(sap, data->host_addr, data->host_addrlen))
+ return -EFAULT;
+ if (!nfs_verify_server_address(sap))
+ goto out_no_address;
+ ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
- ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
- ctx->rsize = data->rsize;
- ctx->wsize = data->wsize;
- ctx->timeo = data->timeo;
- ctx->retrans = data->retrans;
- ctx->acregmin = data->acregmin;
- ctx->acregmax = data->acregmax;
- ctx->acdirmin = data->acdirmin;
- ctx->acdirmax = data->acdirmax;
- ctx->nfs_server.protocol = data->proto;
- nfs_validate_transport_protocol(ctx);
- if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
- goto out_invalid_transport_udp;
+ if (data->auth_flavourlen) {
+ rpc_authflavor_t pseudoflavor;
- break;
- default:
- goto generic;
+ if (data->auth_flavourlen > 1)
+ goto out_inval_auth;
+ if (copy_from_user(&pseudoflavor, data->auth_flavours,
+ sizeof(pseudoflavor)))
+ return -EFAULT;
+ ctx->selected_flavor = pseudoflavor;
+ } else {
+ ctx->selected_flavor = RPC_AUTH_UNIX;
}
+ c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.hostname = c;
+
+ c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.export_path = c;
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+
+ c = strndup_user(data->client_addr.data, 16);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->client_address = c;
+
+ /*
+ * Translate to nfs_fs_context, which nfs_fill_super
+ * can deal with.
+ */
+
+ ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
+ ctx->rsize = data->rsize;
+ ctx->wsize = data->wsize;
+ ctx->timeo = data->timeo;
+ ctx->retrans = data->retrans;
+ ctx->acregmin = data->acregmin;
+ ctx->acregmax = data->acregmax;
+ ctx->acdirmin = data->acdirmin;
+ ctx->acdirmax = data->acdirmax;
+ ctx->nfs_server.protocol = data->proto;
+ nfs_validate_transport_protocol(ctx);
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
+done:
ctx->skip_reconfig_option_check = true;
return 0;
-generic:
- return generic_parse_monolithic(fc, data);
-
-out_no_data:
- if (is_remount_fc(fc)) {
- ctx->skip_reconfig_option_check = true;
- return 0;
- }
- return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data");
-
out_inval_auth:
return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d",
data->auth_flavourlen);
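[Editor's note] nfs4_compat_mount_data_conv() widens the packed 32-bit layout into the native struct in place, which is only safe copying the fields from last to first: every native field lives at an offset greater than or equal to its compat counterpart, so a forward walk would overwrite compat data before it had been read. The idea in miniature, with toy types:

	/* Toy in-place widening: {u32 a, b} overlaid by {u64 a, b}. */
	struct compat_pair { u32 a, b; };
	struct native_pair { u64 a, b; };

	static void pair_conv(struct native_pair *n)
	{
		struct compat_pair *c = (struct compat_pair *)n;

		n->b = c->b;	/* last field first: c->b is still intact */
		n->a = c->a;	/* writing n->a clobbers c->b, already read */
	}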
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 6b063227e34e..2bcbe38afe2e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -32,9 +32,9 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
/*
* nfs_path - reconstruct the path given an arbitrary dentry
* @base - used to return pointer to the end of devname part of path
- * @dentry - pointer to dentry
+ * @dentry_in - pointer to dentry
* @buffer - result buffer
- * @buflen - length of buffer
+ * @buflen_in - length of buffer
* @flags - options (see below)
*
* Helper function for constructing the server pathname
@@ -49,15 +49,19 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
* the original device (export) name
* (if unset, the original name is returned verbatim)
*/
-char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen,
- unsigned flags)
+char *nfs_path(char **p, struct dentry *dentry_in, char *buffer,
+ ssize_t buflen_in, unsigned flags)
{
char *end;
int namelen;
unsigned seq;
const char *base;
+ struct dentry *dentry;
+ ssize_t buflen;
rename_retry:
+ buflen = buflen_in;
+ dentry = dentry_in;
end = buffer+buflen;
*--end = '\0';
buflen--;
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 142225f0af59..2b2211d1234e 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -356,7 +356,15 @@ static ssize_t _nfs42_proc_copy(struct file *src,
truncate_pagecache_range(dst_inode, pos_dst,
pos_dst + res->write_res.count);
-
+ spin_lock(&dst_inode->i_lock);
+ NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE |
+ NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA);
+ spin_unlock(&dst_inode->i_lock);
+ spin_lock(&src_inode->i_lock);
+ NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME);
+ spin_unlock(&src_inode->i_lock);
status = res->write_res.count;
out:
if (args->sync)
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 86777996cfec..b51424ff8159 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -67,7 +67,6 @@ struct nfs4_xattr_bucket {
struct nfs4_xattr_cache {
struct kref ref;
- spinlock_t hash_lock; /* protects hashtable and lru */
struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE];
struct list_head lru;
struct list_head dispose;
@@ -882,7 +881,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
unsigned long count;
- count = list_lru_count(&nfs4_xattr_cache_lru);
+ count = list_lru_shrink_count(&nfs4_xattr_cache_lru, sc);
return vfs_pressure_ratio(count);
}
@@ -976,7 +975,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
- count = list_lru_count(lru);
+ count = list_lru_shrink_count(lru, sc);
return vfs_pressure_ratio(count);
}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index cc50085e151c..0dc31ad2362e 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -45,6 +45,15 @@
#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
+#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \
+ 2 /* data_info4.di_offset */ + \
+ 2 /* data_info4.di_length */)
+#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \
+ 1 /* rpr_eof */ + \
+ 1 /* rpr_contents count */ + \
+ 2 * NFS42_READ_PLUS_SEGMENT_SIZE)
#define encode_seek_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
2 /* offset */ + \
@@ -128,6 +137,14 @@
decode_putfh_maxsz + \
decode_deallocate_maxsz + \
decode_getattr_maxsz)
+#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_read_plus_maxsz)
+#define NFS4_dec_read_plus_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_read_plus_maxsz)
#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -324,6 +341,16 @@ static void encode_deallocate(struct xdr_stream *xdr,
encode_fallocate(xdr, args);
}
+static void encode_read_plus(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ encode_uint64(xdr, args->offset);
+ encode_uint32(xdr, args->count);
+}
+
static void encode_seek(struct xdr_stream *xdr,
const struct nfs42_seek_args *args,
struct compound_hdr *hdr)
@@ -723,6 +750,28 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
}
/*
+ * Encode READ_PLUS request
+ */
+static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_read_plus(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
+ encode_nops(&hdr);
+}
+
+/*
* Encode SEEK request
*/
static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
@@ -970,6 +1019,97 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
return decode_op_hdr(xdr, OP_DEALLOCATE);
}
+static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res,
+ uint32_t *eof)
+{
+ uint32_t count, recvd;
+ uint64_t offset;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ p = xdr_decode_hyper(p, &offset);
+ count = be32_to_cpup(p);
+ recvd = xdr_align_data(xdr, res->count, count);
+ res->count += recvd;
+
+ if (count > recvd) {
+ dprintk("NFS: server cheating in read reply: "
+ "count %u > recvd %u\n", count, recvd);
+ *eof = 0;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *res,
+ uint32_t *eof)
+{
+ uint64_t offset, length, recvd;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 8 + 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ p = xdr_decode_hyper(p, &offset);
+ p = xdr_decode_hyper(p, &length);
+ recvd = xdr_expand_hole(xdr, res->count, length);
+ res->count += recvd;
+
+ if (recvd < length) {
+ *eof = 0;
+ return 1;
+ }
+ return 0;
+}
+
+static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+{
+ uint32_t eof, segments, type;
+ int status, i;
+ __be32 *p;
+
+ status = decode_op_hdr(xdr, OP_READ_PLUS);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ eof = be32_to_cpup(p++);
+ segments = be32_to_cpup(p++);
+ if (segments == 0)
+ goto out;
+
+ for (i = 0; i < segments; i++) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ type = be32_to_cpup(p++);
+ if (type == NFS4_CONTENT_DATA)
+ status = decode_read_plus_data(xdr, res, &eof);
+ else if (type == NFS4_CONTENT_HOLE)
+ status = decode_read_plus_hole(xdr, res, &eof);
+ else
+ return -EINVAL;
+
+ if (status < 0)
+ return status;
+ if (status > 0)
+ break;
+ }
+
+out:
+ res->eof = eof;
+ return 0;
+}
+
static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
{
int status;
@@ -1147,6 +1287,33 @@ out:
}
/*
+ * Decode READ_PLUS request
+ */
+static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_read_plus(xdr, res);
+ if (!status)
+ status = res->count;
+out:
+ return status;
+}
+
+/*
* Decode SEEK request
*/
static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
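[Editor's note] For reference, the wire layout that decode_read_plus() and its helpers walk looks roughly like this (names follow the data_content4/data_info4 terminology used in the maxsz macros above; illustrative, not compilable XDR):

	/*
	 *	bool		rpr_eof;
	 *	uint32_t	count;		   number of segments
	 *	for each segment:
	 *		data_content4	type;	   NFS4_CONTENT_DATA or _HOLE
	 *	case DATA:
	 *		uint64_t	offset;
	 *		opaque		data<>;	   length word + payload
	 *	case HOLE (data_info4):
	 *		uint64_t	di_offset;
	 *		uint64_t	di_length; no payload on the wire
	 */

Data segments splice their payload into the reply pages via xdr_align_data(), while holes are zero-filled locally with xdr_expand_hole(), which is what makes READ_PLUS cheaper than READ for sparse files.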
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0c9505dc852c..065cb04222a1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -599,6 +599,14 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat
return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
}
+static inline bool nfs4_stateid_is_next(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+ u32 seq1 = be32_to_cpu(s1->seqid);
+ u32 seq2 = be32_to_cpu(s2->seqid);
+
+ return seq2 == seq1 + 1U || (seq2 == 1U && seq1 == 0xffffffffU);
+}
+
static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src)
{
return nfs4_stateid_match_other(dst, src) &&
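[Editor's note] nfs4_stateid_is_next() accepts exactly one step forward, with the explicit 0xffffffff -> 1 case because servers skip seqid 0 on wraparound (stateid seqids start at 1, as the nfs4proc.c comment later in this patch reiterates). A tiny self-test of the rule, as a sketch:

	/* Hypothetical self-test for the seqid wraparound rule. */
	static bool seqid_is_next(u32 seq1, u32 seq2)
	{
		return seq2 == seq1 + 1U ||
		       (seq2 == 1U && seq1 == 0xffffffffU);
	}

	static void seqid_selftest(void)
	{
		WARN_ON(!seqid_is_next(1, 2));			/* normal step */
		WARN_ON(!seqid_is_next(0xffffffffU, 1));	/* wrap skips 0 */
		WARN_ON(seqid_is_next(2, 2));			/* no self-step */
		WARN_ON(seqid_is_next(1, 3));			/* missed update */
	}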
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index daacc78a3d48..be7915c861ce 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1045,6 +1045,8 @@ static int nfs4_server_common_setup(struct nfs_server *server,
server->caps |= server->nfs_client->cl_mvops->init_caps;
if (server->flags & NFS_MOUNT_NORDIRPLUS)
server->caps &= ~NFS_CAP_READDIRPLUS;
+ if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
+ server->caps &= ~NFS_CAP_READ_PLUS;
/*
* Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
* authentication.
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 984938024011..9d354de613da 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -146,7 +146,8 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
/* Only offload copy if superblock is the same */
if (file_in->f_op != &nfs4_file_operations)
return -EXDEV;
- if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY))
+ if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY) ||
+ !nfs_server_capable(file_inode(file_in), NFS_CAP_COPY))
return -EOPNOTSUPP;
if (file_inode(file_in) == file_inode(file_out))
return -EOPNOTSUPP;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 62e6eea5c516..8d8aba305ecc 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -46,6 +46,7 @@
#include <keys/user-type.h>
#include <keys/request_key_auth-type.h>
#include <linux/module.h>
+#include <linux/user_namespace.h>
#include "internal.h"
#include "netns.h"
@@ -69,13 +70,13 @@ struct idmap {
struct rpc_pipe *idmap_pipe;
struct idmap_legacy_upcalldata *idmap_upcall_data;
struct mutex idmap_mutex;
- const struct cred *cred;
+ struct user_namespace *user_ns;
};
static struct user_namespace *idmap_userns(const struct idmap *idmap)
{
- if (idmap && idmap->cred)
- return idmap->cred->user_ns;
+ if (idmap && idmap->user_ns)
+ return idmap->user_ns;
return &init_user_ns;
}
@@ -286,7 +287,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
if (ret < 0)
return ERR_PTR(ret);
- if (!idmap->cred || idmap->cred->user_ns == &init_user_ns)
+ if (!idmap->user_ns || idmap->user_ns == &init_user_ns)
rkey = request_key(&key_type_id_resolver, desc, "");
if (IS_ERR(rkey)) {
mutex_lock(&idmap->idmap_mutex);
@@ -462,7 +463,7 @@ nfs_idmap_new(struct nfs_client *clp)
return -ENOMEM;
mutex_init(&idmap->idmap_mutex);
- idmap->cred = get_cred(clp->cl_rpcclient->cl_cred);
+ idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns);
rpc_init_pipe_dir_object(&idmap->idmap_pdo,
&nfs_idmap_pipe_dir_object_ops,
@@ -486,7 +487,7 @@ nfs_idmap_new(struct nfs_client *clp)
err_destroy_pipe:
rpc_destroy_pipe_data(idmap->idmap_pipe);
err:
- put_cred(idmap->cred);
+ put_user_ns(idmap->user_ns);
kfree(idmap);
return error;
}
@@ -503,7 +504,7 @@ nfs_idmap_delete(struct nfs_client *clp)
&clp->cl_rpcclient->cl_pipedir_objects,
&idmap->idmap_pdo);
rpc_destroy_pipe_data(idmap->idmap_pipe);
- put_cred(idmap->cred);
+ put_user_ns(idmap->user_ns);
kfree(idmap);
}
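[Editor's note] With the cred replaced by a pinned user_namespace, the lifetime rule is the usual reference pairing: get_user_ns() at setup, put_user_ns() both in the error unwind and in nfs_idmap_delete(). The pattern in isolation (the context type and injected failure are hypothetical):

	#include <linux/user_namespace.h>

	struct my_ctx {
		struct user_namespace *user_ns;
	};

	static int my_ctx_init(struct my_ctx *ctx, const struct cred *cred,
			       int setup_err)
	{
		ctx->user_ns = get_user_ns(cred->user_ns);
		if (setup_err) {
			put_user_ns(ctx->user_ns); /* unwind drops the ref */
			return setup_err;
		}
		return 0;
	}

	static void my_ctx_destroy(struct my_ctx *ctx)
	{
		put_user_ns(ctx->user_ns);
	}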
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6e95c85fe395..9e0ca9b2b210 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -63,6 +63,7 @@
#include "callback.h"
#include "pnfs.h"
#include "netns.h"
+#include "sysfs.h"
#include "nfs4idmap.h"
#include "nfs4session.h"
#include "fscache.h"
@@ -70,6 +71,10 @@
#include "nfs4trace.h"
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif /* CONFIG_NFS_V4_2 */
+
#define NFSDBG_FACILITY NFSDBG_PROC
#define NFS4_BITMASK_SZ 3
@@ -107,6 +112,9 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
const struct cred *, bool);
#endif
+static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
+ struct nfs_server *server,
+ struct nfs4_label *label);
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
static inline struct nfs4_label *
@@ -1547,19 +1555,6 @@ static void nfs_state_log_update_open_stateid(struct nfs4_state *state)
wake_up_all(&state->waitq);
}
-static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state,
- const nfs4_stateid *stateid)
-{
- u32 state_seqid = be32_to_cpu(state->open_stateid.seqid);
- u32 stateid_seqid = be32_to_cpu(stateid->seqid);
-
- if (stateid_seqid == state_seqid + 1U ||
- (stateid_seqid == 1U && state_seqid == 0xffffffffU))
- nfs_state_log_update_open_stateid(state);
- else
- set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
-}
-
static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
{
struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1585,21 +1580,19 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
* i.e. The stateid seqids have to be initialised to 1, and
* are then incremented on every state transition.
*/
-static bool nfs_need_update_open_stateid(struct nfs4_state *state,
+static bool nfs_stateid_is_sequential(struct nfs4_state *state,
const nfs4_stateid *stateid)
{
- if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 ||
- !nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (test_bit(NFS_OPEN_STATE, &state->flags)) {
+ /* The common case - we're updating to a new sequence number */
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+ nfs4_stateid_is_next(&state->open_stateid, stateid)) {
+ return true;
+ }
+ } else {
+ /* This is the first OPEN in this generation */
if (stateid->seqid == cpu_to_be32(1))
- nfs_state_log_update_open_stateid(state);
- else
- set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
- return true;
- }
-
- if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
- nfs_state_log_out_of_order_open_stateid(state, stateid);
- return true;
+ return true;
}
return false;
}
@@ -1673,16 +1666,16 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
int status = 0;
for (;;) {
- if (!nfs_need_update_open_stateid(state, stateid))
- return;
- if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
+ if (nfs_stateid_is_sequential(state, stateid))
break;
+
if (status)
break;
/* Rely on seqids for serialisation with NFSv4.0 */
if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client))
break;
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
/*
* Ensure we process the state changes in the same order
@@ -1693,6 +1686,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
spin_unlock(&state->owner->so_lock);
rcu_read_unlock();
trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
+
if (!signal_pending(current)) {
if (schedule_timeout(5*HZ) == 0)
status = -EAGAIN;
@@ -3435,7 +3429,8 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
__be32 seqid_open;
u32 dst_seqid;
bool ret;
- int seq;
+ int seq, status = -EAGAIN;
+ DEFINE_WAIT(wait);
for (;;) {
ret = false;
@@ -3447,15 +3442,41 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
continue;
break;
}
+
+ write_seqlock(&state->seqlock);
seqid_open = state->open_stateid.seqid;
- if (read_seqretry(&state->seqlock, seq))
- continue;
dst_seqid = be32_to_cpu(dst->seqid);
- if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0)
- dst->seqid = cpu_to_be32(dst_seqid + 1);
- else
+
+ /* Did another OPEN bump the state's seqid? try again: */
+ if ((s32)(be32_to_cpu(seqid_open) - dst_seqid) > 0) {
dst->seqid = seqid_open;
+ write_sequnlock(&state->seqlock);
+ ret = true;
+ break;
+ }
+
+ /* server says we're behind but we haven't seen the update yet */
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+ prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
+ write_sequnlock(&state->seqlock);
+ trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
+
+ if (signal_pending(current))
+ status = -EINTR;
+ else
+ if (schedule_timeout(5*HZ) != 0)
+ status = 0;
+
+ finish_wait(&state->waitq, &wait);
+
+ if (!status)
+ continue;
+ if (status == -EINTR)
+ break;
+
+ /* we slept the whole 5 seconds, we must have lost a seqid */
+ dst->seqid = cpu_to_be32(dst_seqid + 1);
ret = true;
break;
}
@@ -3632,9 +3653,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
/* Close-to-open cache consistency revalidation */
- if (!nfs4_have_delegation(inode, FMODE_READ))
+ if (!nfs4_have_delegation(inode, FMODE_READ)) {
calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
- else
+ nfs4_bitmask_adjust(calldata->arg.bitmask, inode, NFS_SERVER(inode), NULL);
+ } else
calldata->arg.bitmask = NULL;
}
@@ -5255,28 +5277,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+static bool nfs4_read_plus_not_supported(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
+ struct rpc_message *msg = &task->tk_msg;
+ if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
+ server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
+ server->caps &= ~NFS_CAP_READ_PLUS;
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ rpc_restart_call_prepare(task);
+ return true;
+ }
+ return false;
+}
+
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
dprintk("--> %s\n", __func__);
if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &hdr->args))
return -EAGAIN;
+ if (nfs4_read_plus_not_supported(task, hdr))
+ return -EAGAIN;
if (task->tk_status > 0)
nfs_invalidate_atime(hdr->inode);
return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
nfs4_read_done_cb(task, hdr);
}
+#ifdef CONFIG_NFS_V4_2
+static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+{
+ if (server->caps & NFS_CAP_READ_PLUS)
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+ else
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+}
+#else
+static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+{
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+}
+#endif /* CONFIG_NFS_V4_2 */
+
static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
hdr->timestamp = jiffies;
if (!hdr->pgio_done_cb)
hdr->pgio_done_cb = nfs4_read_done_cb;
- msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
}
@@ -5360,6 +5414,38 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
}
+static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
+ struct nfs_server *server,
+ struct nfs4_label *label)
+{
+
+ unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+
+ if ((cache_validity & NFS_INO_INVALID_DATA) ||
+ (cache_validity & NFS_INO_REVAL_PAGECACHE) ||
+ (cache_validity & NFS_INO_REVAL_FORCED) ||
+ (cache_validity & NFS_INO_INVALID_OTHER))
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
+
+ if (cache_validity & NFS_INO_INVALID_ATIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
+ if (cache_validity & NFS_INO_INVALID_ACCESS)
+ bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
+ FATTR4_WORD1_OWNER_GROUP;
+ if (cache_validity & NFS_INO_INVALID_ACL)
+ bitmask[0] |= FATTR4_WORD0_ACL;
+ if (cache_validity & NFS_INO_INVALID_LABEL)
+ bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ if (cache_validity & NFS_INO_INVALID_CTIME)
+ bitmask[0] |= FATTR4_WORD0_CHANGE;
+ if (cache_validity & NFS_INO_INVALID_MTIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
+ if (cache_validity & NFS_INO_INVALID_SIZE)
+ bitmask[0] |= FATTR4_WORD0_SIZE;
+ if (cache_validity & NFS_INO_INVALID_BLOCKS)
+ bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+}
+
static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg,
struct rpc_clnt **clnt)
@@ -5369,8 +5455,10 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
if (!nfs4_write_need_cache_consistency_data(hdr)) {
hdr->args.bitmask = NULL;
hdr->res.fattr = NULL;
- } else
+ } else {
hdr->args.bitmask = server->cache_consistency_bitmask;
+ nfs4_bitmask_adjust(hdr->args.bitmask, hdr->inode, server, NULL);
+ }
if (!hdr->pgio_done_cb)
hdr->pgio_done_cb = nfs4_write_done_cb;
@@ -6006,9 +6094,34 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
memcpy(bootverf->data, verf, sizeof(bootverf->data));
}
+static size_t
+nfs4_get_uniquifier(struct nfs_client *clp, char *buf, size_t buflen)
+{
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+ struct nfs_netns_client *nn_clp = nn->nfs_client;
+ const char *id;
+
+ buf[0] = '\0';
+
+ if (nn_clp) {
+ rcu_read_lock();
+ id = rcu_dereference(nn_clp->identifier);
+ if (id)
+ strscpy(buf, id, buflen);
+ rcu_read_unlock();
+ }
+
+ if (nfs4_client_id_uniquifier[0] != '\0' && buf[0] == '\0')
+ strscpy(buf, nfs4_client_id_uniquifier, buflen);
+
+ return strlen(buf);
+}
+
static int
nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
+ char buf[NFS4_CLIENT_ID_UNIQ_LEN];
+ size_t buflen;
size_t len;
char *str;
@@ -6022,8 +6135,11 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
1;
rcu_read_unlock();
- if (nfs4_client_id_uniquifier[0] != '\0')
- len += strlen(nfs4_client_id_uniquifier) + 1;
+
+ buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf));
+ if (buflen)
+ len += buflen + 1;
+
if (len > NFS4_OPAQUE_LIMIT + 1)
return -EINVAL;
@@ -6037,10 +6153,9 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
return -ENOMEM;
rcu_read_lock();
- if (nfs4_client_id_uniquifier[0] != '\0')
+ if (buflen)
scnprintf(str, len, "Linux NFSv4.0 %s/%s/%s",
- clp->cl_rpcclient->cl_nodename,
- nfs4_client_id_uniquifier,
+ clp->cl_rpcclient->cl_nodename, buf,
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_ADDR));
else
@@ -6055,50 +6170,23 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
}
static int
-nfs4_init_uniquifier_client_string(struct nfs_client *clp)
-{
- size_t len;
- char *str;
-
- len = 10 + 10 + 1 + 10 + 1 +
- strlen(nfs4_client_id_uniquifier) + 1 +
- strlen(clp->cl_rpcclient->cl_nodename) + 1;
-
- if (len > NFS4_OPAQUE_LIMIT + 1)
- return -EINVAL;
-
- /*
- * Since this string is allocated at mount time, and held until the
- * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
- * about a memory-reclaim deadlock.
- */
- str = kmalloc(len, GFP_KERNEL);
- if (!str)
- return -ENOMEM;
-
- scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
- clp->rpc_ops->version, clp->cl_minorversion,
- nfs4_client_id_uniquifier,
- clp->cl_rpcclient->cl_nodename);
- clp->cl_owner_id = str;
- return 0;
-}
-
-static int
nfs4_init_uniform_client_string(struct nfs_client *clp)
{
+ char buf[NFS4_CLIENT_ID_UNIQ_LEN];
+ size_t buflen;
size_t len;
char *str;
if (clp->cl_owner_id != NULL)
return 0;
- if (nfs4_client_id_uniquifier[0] != '\0')
- return nfs4_init_uniquifier_client_string(clp);
-
len = 10 + 10 + 1 + 10 + 1 +
strlen(clp->cl_rpcclient->cl_nodename) + 1;
+ buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf));
+ if (buflen)
+ len += buflen + 1;
+
if (len > NFS4_OPAQUE_LIMIT + 1)
return -EINVAL;
@@ -6111,9 +6199,14 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- scnprintf(str, len, "Linux NFSv%u.%u %s",
- clp->rpc_ops->version, clp->cl_minorversion,
- clp->cl_rpcclient->cl_nodename);
+ if (buflen)
+ scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ buf, clp->cl_rpcclient->cl_nodename);
+ else
+ scnprintf(str, len, "Linux NFSv%u.%u %s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ clp->cl_rpcclient->cl_nodename);
clp->cl_owner_id = str;
return 0;
}
@@ -6406,6 +6499,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
+ nfs4_bitmask_adjust(data->args.bitmask, inode, server, NULL);
nfs_copy_fh(&data->fh, NFS_FH(inode));
nfs4_stateid_copy(&data->stateid, stateid);
data->res.fattr = &data->fattr;
@@ -7440,7 +7534,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
len = security_inode_listsecurity(inode, list, list_len);
- if (list_len && len > list_len)
+ if (len >= 0 && list_len && len > list_len)
return -ERANGE;
}
return len;
@@ -8039,9 +8133,11 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
* both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
* DS flags set.
*/
-static int nfs4_check_cl_exchange_flags(u32 flags)
+static int nfs4_check_cl_exchange_flags(u32 flags, u32 version)
{
- if (flags & ~EXCHGID4_FLAG_MASK_R)
+ if (version >= 2 && (flags & ~EXCHGID4_2_FLAG_MASK_R))
+ goto out_inval;
+ else if (version < 2 && (flags & ~EXCHGID4_FLAG_MASK_R))
goto out_inval;
if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
(flags & EXCHGID4_FLAG_USE_NON_PNFS))
@@ -8454,7 +8550,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre
if (status != 0)
goto out;
- status = nfs4_check_cl_exchange_flags(resp->flags);
+ status = nfs4_check_cl_exchange_flags(resp->flags,
+ clp->cl_mvops->minor_version);
if (status != 0)
goto out;
@@ -9693,7 +9790,6 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
- struct rpc_clnt *clnt = server->client;
struct nfs4_call_sync_data data = {
.seq_server = server,
.seq_args = &args.seq_args,
@@ -9710,8 +9806,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
if (use_integrity) {
- clnt = server->nfs_client->cl_rpcclient;
- task_setup.rpc_client = clnt;
+ task_setup.rpc_client = server->nfs_client->cl_rpcclient;
cred = nfs4_get_clid_cred(server->nfs_client);
msg.rpc_cred = cred;
@@ -10165,7 +10260,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
| NFS_CAP_CLONE
- | NFS_CAP_LAYOUTERROR,
+ | NFS_CAP_LAYOUTERROR
+ | NFS_CAP_READ_PLUS,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
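[Editor's note] With nfs4_get_uniquifier() the uniquifier now comes from the per-netns sysfs identifier when set, falling back to the nfs4_client_id_uniquifier module-parameter string, and the uniform owner string gains a "<uniquifier>/<nodename>" form. Illustrative only, how the two shapes are assembled:

	/* Sketch of the owner-string shapes built above. */
	static void show_owner_id(char *str, size_t len, unsigned int minor,
				  const char *uniq, const char *nodename)
	{
		if (uniq && uniq[0])
			scnprintf(str, len, "Linux NFSv4.%u %s/%s",
				  minor, uniq, nodename);
		else
			scnprintf(str, len, "Linux NFSv4.%u %s",
				  minor, nodename);
	}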
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index b4f852d4d099..484c1da96dea 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1511,6 +1511,7 @@ DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_close_stateid_update_wait);
DECLARE_EVENT_CLASS(nfs4_getattr_event,
TP_PROTO(
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0b3510f62623..c6dbfcae7517 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5308,7 +5308,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
uint32_t attrlen,
bitmap[3] = {0};
int status;
- unsigned int pg_offset;
res->acl_len = 0;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -5316,9 +5315,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
xdr_enter_page(xdr, xdr->buf->page_len);
- /* Calculate the offset of the page data */
- pg_offset = xdr->buf->head[0].iov_len;
-
if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
goto out;
if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -5331,7 +5327,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
/* The bitmap (xdr len + bitmaps) and the attr xdr len words
* are stored with the acl data to handle the problem of
* variable length bitmaps.*/
- res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset;
+ res->acl_data_offset = xdr_page_pos(xdr);
res->acl_len = attrlen;
/* Check for receive buffer overflow */
@@ -7619,6 +7615,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(SETXATTR, enc_setxattr, dec_setxattr),
PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
+ PROC42(READ_PLUS, enc_read_plus, dec_read_plus),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 71f7741126b6..0e50b9d45c32 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -902,7 +902,7 @@ restart:
}
/*
- * Called by the state manger to remove all layouts established under an
+ * Called by the state manager to remove all layouts established under an
* expired lease.
*/
void
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f7dad8227a5f..4034102010f0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -906,7 +906,7 @@ static struct nfs_server *nfs_try_mount_request(struct fs_context *fc)
default:
if (rpcauth_get_gssinfo(flavor, &info) != 0)
continue;
- /* Fallthrough */
+ break;
}
dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
ctx->selected_flavor = flavor;
@@ -1217,13 +1217,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
}
#endif
-static void nfs_set_readahead(struct backing_dev_info *bdi,
- unsigned long iomax_pages)
-{
- bdi->ra_pages = VM_READAHEAD_PAGES;
- bdi->io_pages = iomax_pages;
-}
-
int nfs_get_tree_common(struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
@@ -1268,7 +1261,7 @@ int nfs_get_tree_common(struct fs_context *fc)
MINOR(server->s_dev));
if (error)
goto error_splat_super;
- nfs_set_readahead(s->s_bdi, server->rpages);
+ s->s_bdi->io_pages = server->rpages;
server->super = s;
}
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index c489496b5659..8cb70755e3c9 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -79,7 +79,12 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj,
struct nfs_netns_client *c = container_of(kobj,
struct nfs_netns_client,
kobject);
- return scnprintf(buf, PAGE_SIZE, "%s\n", c->identifier);
+ ssize_t ret;
+
+ rcu_read_lock();
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier));
+ rcu_read_unlock();
+ return ret;
}
/* Strip trailing '\n' */
@@ -107,7 +112,7 @@ static ssize_t nfs_netns_identifier_store(struct kobject *kobj,
p = kmemdup_nul(buf, len, GFP_KERNEL);
if (!p)
return -ENOMEM;
- old = xchg(&c->identifier, p);
+ old = rcu_dereference_protected(xchg(&c->identifier, (char __rcu *)p), 1);
if (old) {
synchronize_rcu();
kfree(old);
@@ -121,7 +126,7 @@ static void nfs_netns_client_release(struct kobject *kobj)
struct nfs_netns_client,
kobject);
- kfree(c->identifier);
+ kfree(rcu_dereference_raw(c->identifier));
kfree(c);
}
diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h
index ebcbdc40483b..5501ef573c32 100644
--- a/fs/nfs/sysfs.h
+++ b/fs/nfs/sysfs.h
@@ -11,7 +11,7 @@
struct nfs_netns_client {
struct kobject kobject;
struct net *net;
- const char *identifier;
+ const char __rcu *identifier;
};
extern struct kobject *nfs_client_kobj;
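The sysfs change above annotates the identifier as __rcu and pairs each access with the matching RCU primitive. A minimal sketch of the same publish/read/free pattern, with hypothetical demo_* names and reduced error handling:

	#include <linux/errno.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	static char __rcu *demo_id;

	static void demo_read(char *buf, size_t len)
	{
		rcu_read_lock();
		/* reader: dereference only inside the RCU read-side section */
		strscpy(buf, rcu_dereference(demo_id) ?: "", len);
		rcu_read_unlock();
	}

	static int demo_store(const char *new)
	{
		char *p, *old;

		p = kstrdup(new, GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		/* updater: publish first, then wait out readers before freeing */
		old = rcu_replace_pointer(demo_id, p, true);
		synchronize_rcu();
		kfree(old);
		return 0;
	}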
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 311e5ce80cfc..a07c39c94bbd 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -170,7 +170,7 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
- if (sb->s_bdev != sb->s_bdev->bd_contains)
+ if (bdev_is_partition(sb->s_bdev))
return nfserr_inval;
return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
}
@@ -382,7 +382,7 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
- if (sb->s_bdev != sb->s_bdev->bd_contains)
+ if (bdev_is_partition(sb->s_bdev))
return nfserr_inval;
return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
}
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index e516ae389ca5..5900879d5693 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -355,7 +355,7 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
/**
* nilfs_bmap_assign - assign a new block number to a block
* @bmap: bmap
- * @bhp: pointer to buffer head
+ * @bh: pointer to buffer head
* @blocknr: block number
* @binfo: block information
*
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 86d4d850d130..025fb082575a 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -889,7 +889,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
* nilfs_cpfile_change_cpmode - change checkpoint mode
* @cpfile: inode of checkpoint file
* @cno: checkpoint number
- * @status: mode of checkpoint
+ * @mode: mode of checkpoint
*
* Description: nilfs_cpfile_change_cpmode() changes the mode of the checkpoint
* specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT.
@@ -930,12 +930,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
/**
* nilfs_cpfile_get_stat - get checkpoint statistics
* @cpfile: inode of checkpoint file
- * @stat: pointer to a structure of checkpoint statistics
+ * @cpstat: pointer to a structure of checkpoint statistics
*
* Description: nilfs_cpfile_get_stat() returns information about checkpoints.
*
* Return Value: On success, 0 is returned, and checkpoints information is
- * stored in the place pointed by @stat. On error, one of the following
+ * stored in the place pointed by @cpstat. On error, one of the following
* negative error codes is returned.
*
* %-EIO - I/O error.
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index b175f1330408..171fb5cd427f 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -69,7 +69,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
/**
* nilfs_forget_buffer - discard dirty state
- * @inode: owner inode of the buffer
* @bh: buffer head of the buffer to be discarded
*/
void nilfs_forget_buffer(struct buffer_head *bh)
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 42ff67c0c14f..63722475e17e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -546,13 +546,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
/**
* nilfs_sufile_get_stat - get segment usage statistics
* @sufile: inode of segment usage file
- * @stat: pointer to a structure of segment usage statistics
+ * @sustat: pointer to a structure of segment usage statistics
*
* Description: nilfs_sufile_get_stat() returns information about segment
* usage.
*
* Return Value: On success, 0 is returned, and segment usage information is
- * stored in the place pointed by @stat. On error, one of the following
+ * stored in the place pointed by @sustat. On error, one of the following
* negative error codes is returned.
*
* %-EIO - I/O error.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index c942910a8649..9167884a61ec 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -531,6 +531,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ struct mem_cgroup *old_memcg;
struct inode *child = NULL;
bool name_event = false;
@@ -580,7 +581,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
gfp |= __GFP_RETRY_MAYFAIL;
/* Whoever is interested in the event, pays for the allocation. */
- memalloc_use_memcg(group->memcg);
+ old_memcg = set_active_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
@@ -608,7 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
event->pid = get_pid(task_tgid(current));
out:
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
return event;
}
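set_active_memcg() replaces the memalloc_use_memcg()/memalloc_unuse_memcg() pair with a save/restore API that nests safely. A minimal sketch of the charging pattern, assuming a hypothetical demo_* caller:

	#include <linux/sched/mm.h>
	#include <linux/memcontrol.h>
	#include <linux/slab.h>

	static void *demo_alloc_charged(struct mem_cgroup *memcg, size_t size)
	{
		struct mem_cgroup *old_memcg;
		void *p;

		old_memcg = set_active_memcg(memcg);	/* charge to memcg ... */
		p = kmalloc(size, GFP_KERNEL_ACCOUNT);	/* ... for accounted allocations */
		set_active_memcg(old_memcg);		/* restore the previous state */
		return p;
	}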
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index a65cf8c9f600..9ddcbadc98e2 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -66,6 +66,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
int ret;
int len = 0;
int alloc_len = sizeof(struct inotify_event_info);
+ struct mem_cgroup *old_memcg;
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
@@ -87,9 +88,9 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
* trigger the OOM killer in the target monitoring memcg, as that may have
* security repercussions.
*/
- memalloc_use_memcg(group->memcg);
+ old_memcg = set_active_memcg(group->memcg);
event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
if (unlikely(!event)) {
/*
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 9bb9f0952b18..caf563981532 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1810,6 +1810,12 @@ int ntfs_read_inode_mount(struct inode *vi)
brelse(bh);
}
+ if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
+ ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
+ le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
+ goto err_out;
+ }
+
/* Apply the mst fixups. */
if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
/* FIXME: Try to use the $MFTMirr now. */
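The added check rejects a corrupted $MFT record before the mst fixups touch it. A sketch of validating a little-endian on-disk size field against the expected mount-time value, helper name hypothetical:

	#include <linux/types.h>
	#include <asm/byteorder.h>

	static bool demo_record_size_ok(__le32 on_disk, u32 expected)
	{
		/* on-disk fields are little-endian; convert before comparing */
		return le32_to_cpu(on_disk) == expected;
	}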
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4c1b90442d6f..78710788c237 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6013,7 +6013,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
- /* Appending truncate log(TA) and and flushing truncate log(TF) are
+ /* Appending truncate log(TA) and flushing truncate log(TF) are
* two separate transactions. Both can be committed but not
* checkpointed. If a crash occurs then, both transactions will be
* replayed with several clusters already released to the global bitmap.
@@ -7654,8 +7654,10 @@ out_mutex:
* main_bm related locks for avoiding the current IO starve, then go to
* trim the next group
*/
- if (ret >= 0 && group <= last_group)
+ if (ret >= 0 && group <= last_group) {
+ cond_resched();
goto next_group;
+ }
out:
range->len = trimmed * sb->s_blocksize;
return ret;
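The cond_resched() added above follows the usual idiom for long-running loops that hold no locks: yield between iterations so trimming a large filesystem does not monopolize the CPU. A sketch under that assumption, with a hypothetical per-group helper:

	#include <linux/sched.h>

	static int demo_trim_one_group(unsigned long group);	/* hypothetical */

	static int demo_trim_groups(unsigned long first, unsigned long last)
	{
		unsigned long group;
		int ret = 0;

		for (group = first; group <= last && ret >= 0; group++) {
			ret = demo_trim_one_group(group);
			cond_resched();	/* let other tasks run between groups */
		}
		return ret < 0 ? ret : 0;
	}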
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 89d13e0705fe..0179a73a3fa2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1766,7 +1766,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
int sectsize;
char *p = (char *)page;
struct fd f;
- struct inode *inode;
ssize_t ret = -EINVAL;
int live_threshold;
@@ -1793,20 +1792,16 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
reg->hr_block_bytes == 0)
goto out2;
- inode = igrab(f.file->f_mapping->host);
- if (inode == NULL)
+ if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- if (!S_ISBLK(inode->i_mode))
- goto out3;
-
- reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
- if (ret) {
+ reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
+ FMODE_WRITE | FMODE_READ, NULL);
+ if (IS_ERR(reg->hr_bdev)) {
+ ret = PTR_ERR(reg->hr_bdev);
reg->hr_bdev = NULL;
- goto out3;
+ goto out2;
}
- inode = NULL;
bdevname(reg->hr_bdev, reg->hr_dev_name);
@@ -1909,16 +1904,13 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
config_item_name(&reg->hr_item), reg->hr_dev_name);
out3:
- iput(inode);
+ if (ret < 0) {
+ blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE);
+ reg->hr_bdev = NULL;
+ }
out2:
fdput(f);
out:
- if (ret < 0) {
- if (reg->hr_bdev) {
- blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- reg->hr_bdev = NULL;
- }
- }
return ret;
}
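The heartbeat hunk drops the igrab()/blkdev_get() dance in favour of blkdev_get_by_dev(), which opens by device number and reports failure through ERR_PTR(). A sketch of that pattern, names hypothetical:

	#include <linux/blkdev.h>
	#include <linux/err.h>

	static struct block_device *demo_open_bdev(dev_t dev)
	{
		struct block_device *bdev;

		bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
		if (IS_ERR(bdev))
			return bdev;	/* caller distinguishes via IS_ERR() */

		/* use the device; release with blkdev_put() and the same mode */
		return bdev;
	}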
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 720e9f94957e..fc8252a28cb1 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -677,7 +677,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
/*
* Under certain conditions, the window slide code
* might have reduced the number of bits available or
- * disabled the the local alloc entirely. Re-check
+ * disabled the local alloc entirely. Re-check
* here and return -ENOSPC if necessary.
*/
status = -ENOSPC;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d07fb92b7253..955ecd4030f0 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -43,7 +43,8 @@ static bool ovl_must_copy_xattr(const char *name)
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
}
-int ovl_copy_xattr(struct dentry *old, struct dentry *new)
+int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
+ struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
@@ -81,7 +82,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
}
list_size -= slen;
- if (ovl_is_private_xattr(name))
+ if (ovl_is_private_xattr(sb, name))
continue;
retry:
size = vfs_getxattr(old, name, value, value_size);
@@ -128,7 +129,8 @@ out:
return error;
}
-static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
+static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
+ struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
@@ -218,7 +220,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
len -= bytes;
}
out:
- if (!error)
+ if (!error && ovl_should_sync(ofs))
error = vfs_fsync(new_file, 0);
fput(new_file);
out_fput:
@@ -354,7 +356,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
}
/* Store file handle of @upper dir in @index dir entry */
-static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
+static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
+ struct dentry *index)
{
const struct ovl_fh *fh;
int err;
@@ -363,7 +366,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
if (IS_ERR(fh))
return PTR_ERR(fh);
- err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0);
+ err = ovl_do_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len);
kfree(fh);
return err;
@@ -408,7 +411,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (IS_ERR(temp))
goto free_name;
- err = ovl_set_upper_fh(upper, temp);
+ err = ovl_set_upper_fh(OVL_FS(dentry->d_sb), upper, temp);
if (err)
goto out;
@@ -484,6 +487,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
int err;
/*
@@ -499,12 +503,13 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
upperpath.dentry = temp;
ovl_path_lowerdata(c->dentry, &datapath);
- err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
+ err = ovl_copy_up_data(ofs, &datapath, &upperpath,
+ c->stat.size);
if (err)
return err;
}
- err = ovl_copy_xattr(c->lowerpath.dentry, temp);
+ err = ovl_copy_xattr(c->dentry->d_sb, c->lowerpath.dentry, temp);
if (err)
return err;
@@ -781,9 +786,33 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
return true;
}
+static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
+{
+ ssize_t res;
+ char *buf;
+
+ res = vfs_getxattr(dentry, name, NULL, 0);
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ res = 0;
+
+ if (res > 0) {
+ buf = kzalloc(res, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ res = vfs_getxattr(dentry, name, buf, res);
+ if (res < 0)
+ kfree(buf);
+ else
+ *value = buf;
+ }
+ return res;
+}
+
/* Copy up data of an inode which was copied up metadata only in the past. */
static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct path upperpath, datapath;
int err;
char *capability = NULL;
@@ -799,12 +828,12 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
if (c->stat.size) {
err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
- &capability, 0);
- if (err < 0 && err != -ENODATA)
+ &capability);
+ if (cap_size < 0)
goto out;
}
- err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
+ err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size);
if (err)
goto out_free;
@@ -813,14 +842,14 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
* don't want that to happen for normal copy-up operation.
*/
if (capability) {
- err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
- capability, cap_size, 0);
+ err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
+ capability, cap_size, 0);
if (err)
goto out_free;
}
- err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY);
+ err = ovl_do_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
if (err)
goto out_free;
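The new ovl_getxattr() helper uses the classic two-call xattr pattern: probe with a NULL buffer to learn the value size, then fetch into a buffer of exactly that size. A reduced sketch, assuming a hypothetical demo_* wrapper:

	#include <linux/slab.h>
	#include <linux/xattr.h>

	static ssize_t demo_read_xattr(struct dentry *dentry, const char *name,
				       char **value)
	{
		ssize_t size;
		char *buf;

		size = vfs_getxattr(dentry, name, NULL, 0);	/* probe the length */
		if (size <= 0)
			return size;		/* absent, empty, or error */

		buf = kzalloc(size, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		size = vfs_getxattr(dentry, name, buf, size);	/* actual read */
		if (size < 0) {
			kfree(buf);
			return size;
		}
		*value = buf;
		return size;
	}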
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 1bba4813f9cb..28a075b5f5b2 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -394,7 +394,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (IS_ERR(opaquedir))
goto out_unlock;
- err = ovl_copy_xattr(upper, opaquedir);
+ err = ovl_copy_xattr(dentry->d_sb, upper, opaquedir);
if (err)
goto out_cleanup;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 0e696f72cf65..ed35be3fafc6 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -752,7 +752,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
goto out_err;
}
if (index) {
- err = ovl_verify_origin(index, origin.dentry, false);
+ err = ovl_verify_origin(ofs, index, origin.dentry, false);
if (err)
goto out_err;
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 0d940e29d62b..efccb7c1f9bc 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -136,6 +136,13 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
static int ovl_real_fdget(const struct file *file, struct fd *real)
{
+ if (d_is_dir(file_dentry(file))) {
+ real->flags = 0;
+ real->file = ovl_dir_real_file(file, false);
+
+ return PTR_ERR_OR_ZERO(real->file);
+ }
+
return ovl_real_fdget_meta(file, real, false);
}
@@ -331,6 +338,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
struct fd real;
const struct cred *old_cred;
ssize_t ret;
+ int ifl = iocb->ki_flags;
if (!iov_iter_count(iter))
return 0;
@@ -346,11 +354,14 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
goto out_unlock;
+ if (!ovl_should_sync(OVL_FS(inode->i_sb)))
+ ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
file_start_write(real.file);
ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb->ki_flags));
+ ovl_iocb_to_rwf(ifl));
file_end_write(real.file);
/* Update size */
ovl_copyattr(ovl_inode_real(inode), inode);
@@ -370,6 +381,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
real.flags = 0;
aio_req->orig_iocb = iocb;
kiocb_clone(&aio_req->iocb, iocb, real.file);
+ aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
if (ret != -EIOCBQUEUED)
@@ -433,6 +445,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
const struct cred *old_cred;
int ret;
+ if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb)))
+ return 0;
+
ret = ovl_real_fdget_meta(file, &real, !datasync);
if (ret)
return ret;
@@ -544,12 +559,28 @@ static long ovl_real_ioctl(struct file *file, unsigned int cmd,
return ret;
}
+static unsigned int ovl_iflags_to_fsflags(unsigned int iflags)
+{
+ unsigned int flags = 0;
+
+ if (iflags & S_SYNC)
+ flags |= FS_SYNC_FL;
+ if (iflags & S_APPEND)
+ flags |= FS_APPEND_FL;
+ if (iflags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+ if (iflags & S_NOATIME)
+ flags |= FS_NOATIME_FL;
+
+ return flags;
+}
+
static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int iflags)
+ unsigned long arg, unsigned int flags)
{
long ret;
struct inode *inode = file_inode(file);
- unsigned int old_iflags;
+ unsigned int oldflags;
if (!inode_owner_or_capable(inode))
return -EACCES;
@@ -561,10 +592,9 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
inode_lock(inode);
/* Check the capability before cred override */
- ret = -EPERM;
- old_iflags = READ_ONCE(inode->i_flags);
- if (((iflags ^ old_iflags) & (S_APPEND | S_IMMUTABLE)) &&
- !capable(CAP_LINUX_IMMUTABLE))
+ oldflags = ovl_iflags_to_fsflags(READ_ONCE(inode->i_flags));
+ ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+ if (ret)
goto unlock;
ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY);
@@ -583,22 +613,6 @@ unlock:
}
-static unsigned int ovl_fsflags_to_iflags(unsigned int flags)
-{
- unsigned int iflags = 0;
-
- if (flags & FS_SYNC_FL)
- iflags |= S_SYNC;
- if (flags & FS_APPEND_FL)
- iflags |= S_APPEND;
- if (flags & FS_IMMUTABLE_FL)
- iflags |= S_IMMUTABLE;
- if (flags & FS_NOATIME_FL)
- iflags |= S_NOATIME;
-
- return iflags;
-}
-
static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -607,24 +621,23 @@ static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd,
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- return ovl_ioctl_set_flags(file, cmd, arg,
- ovl_fsflags_to_iflags(flags));
+ return ovl_ioctl_set_flags(file, cmd, arg, flags);
}
-static unsigned int ovl_fsxflags_to_iflags(unsigned int xflags)
+static unsigned int ovl_fsxflags_to_fsflags(unsigned int xflags)
{
- unsigned int iflags = 0;
+ unsigned int flags = 0;
if (xflags & FS_XFLAG_SYNC)
- iflags |= S_SYNC;
+ flags |= FS_SYNC_FL;
if (xflags & FS_XFLAG_APPEND)
- iflags |= S_APPEND;
+ flags |= FS_APPEND_FL;
if (xflags & FS_XFLAG_IMMUTABLE)
- iflags |= S_IMMUTABLE;
+ flags |= FS_IMMUTABLE_FL;
if (xflags & FS_XFLAG_NOATIME)
- iflags |= S_NOATIME;
+ flags |= FS_NOATIME_FL;
- return iflags;
+ return flags;
}
static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd,
@@ -637,10 +650,10 @@ static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd,
return -EFAULT;
return ovl_ioctl_set_flags(file, cmd, arg,
- ovl_fsxflags_to_iflags(fa.fsx_xflags));
+ ovl_fsxflags_to_fsflags(fa.fsx_xflags));
}
-static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
long ret;
@@ -665,8 +678,8 @@ static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return ret;
}
-static long ovl_compat_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
+#ifdef CONFIG_COMPAT
+long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
case FS_IOC32_GETFLAGS:
@@ -683,6 +696,7 @@ static long ovl_compat_ioctl(struct file *file, unsigned int cmd,
return ovl_ioctl(file, cmd, arg);
}
+#endif
enum ovl_copyop {
OVL_COPY,
@@ -784,7 +798,9 @@ const struct file_operations ovl_file_operations = {
.fallocate = ovl_fallocate,
.fadvise = ovl_fadvise,
.unlocked_ioctl = ovl_ioctl,
+#ifdef CONFIG_COMPAT
.compat_ioctl = ovl_compat_ioctl,
+#endif
.splice_read = ovl_splice_read,
.splice_write = ovl_splice_write,
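The write-path changes implement "volatile" by masking the sync bits out of a local copy of the iocb flags before the write is forwarded, covering both the synchronous and the AIO path. A minimal sketch, helper name hypothetical:

	#include <linux/fs.h>

	static int demo_effective_ki_flags(const struct kiocb *iocb, bool should_sync)
	{
		int ifl = iocb->ki_flags;

		if (!should_sync)	/* volatile mount: drop sync semantics */
			ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
		return ifl;
	}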
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 8be6cd264f66..b584dca845ba 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -327,7 +327,7 @@ static const char *ovl_get_link(struct dentry *dentry,
return p;
}
-bool ovl_is_private_xattr(const char *name)
+bool ovl_is_private_xattr(struct super_block *sb, const char *name)
{
return strncmp(name, OVL_XATTR_PREFIX,
sizeof(OVL_XATTR_PREFIX) - 1) == 0;
@@ -391,15 +391,18 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
return res;
}
-static bool ovl_can_list(const char *s)
+static bool ovl_can_list(struct super_block *sb, const char *s)
{
+ /* Never list private (.overlay) */
+ if (ovl_is_private_xattr(sb, s))
+ return false;
+
/* List all non-trusted xatts */
if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
return true;
- /* Never list trusted.overlay, list other trusted for superuser only */
- return !ovl_is_private_xattr(s) &&
- ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
+ /* list other trusted for superuser only */
+ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
@@ -425,7 +428,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
return -EIO;
len -= slen;
- if (!ovl_can_list(s)) {
+ if (!ovl_can_list(dentry->d_sb, s)) {
res -= slen;
memmove(s, s + slen, len);
} else {
@@ -722,8 +725,8 @@ static int ovl_set_nlink_common(struct dentry *dentry,
if (WARN_ON(len >= sizeof(buf)))
return -EIO;
- return ovl_do_setxattr(ovl_dentry_upper(dentry),
- OVL_XATTR_NLINK, buf, len, 0);
+ return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
+ OVL_XATTR_NLINK, buf, len);
}
int ovl_set_nlink_upper(struct dentry *dentry)
@@ -736,7 +739,7 @@ int ovl_set_nlink_lower(struct dentry *dentry)
return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
}
-unsigned int ovl_get_nlink(struct dentry *lowerdentry,
+unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback)
{
@@ -748,7 +751,8 @@ unsigned int ovl_get_nlink(struct dentry *lowerdentry,
if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
return fallback;
- err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1);
+ err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK,
+ &buf, sizeof(buf) - 1);
if (err < 0)
goto fail;
@@ -946,6 +950,7 @@ static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
struct inode *ovl_get_inode(struct super_block *sb,
struct ovl_inode_params *oip)
{
+ struct ovl_fs *ofs = OVL_FS(sb);
struct dentry *upperdentry = oip->upperdentry;
struct ovl_path *lowerpath = oip->lowerpath;
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
@@ -993,7 +998,8 @@ struct inode *ovl_get_inode(struct super_block *sb,
/* Recalculate nlink for non-dir due to indexing */
if (!is_dir)
- nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
+ nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry,
+ nlink);
set_nlink(inode, nlink);
ino = key->i_ino;
} else {
@@ -1009,7 +1015,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
ovl_inode_init(inode, oip, ino, fsid);
- if (upperdentry && ovl_is_impuredir(upperdentry))
+ if (upperdentry && ovl_is_impuredir(sb, upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
if (oip->index)
@@ -1023,7 +1029,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
/* Check for non-merge dir that may have whiteouts */
if (is_dir) {
if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
- ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
+ ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) {
ovl_set_flag(OVL_WHITEOUTS, inode);
}
}
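The reworked filter hoists the private-prefix test to the front, so trusted.overlay.* is hidden from everyone and only the remaining trusted.* names are gated on CAP_SYS_ADMIN. A standalone sketch with a hypothetical private prefix and plain capable() in place of the ns-aware check:

	#include <linux/capability.h>
	#include <linux/string.h>
	#include <linux/xattr.h>

	static bool demo_can_list(const char *name)
	{
		if (!strncmp(name, "trusted.demo.", 13))	/* never list private */
			return false;
		if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
			return true;				/* non-trusted: always */
		return capable(CAP_SYS_ADMIN);			/* trusted: admin only */
	}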
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index f7d4358db637..a6162c4076db 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -30,8 +30,9 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d,
{
int res;
char *buf;
+ struct ovl_fs *ofs = OVL_FS(d->sb);
- buf = ovl_get_redirect_xattr(dentry, prelen + strlen(post));
+ buf = ovl_get_redirect_xattr(ofs, dentry, prelen + strlen(post));
if (IS_ERR_OR_NULL(buf))
return PTR_ERR(buf);
@@ -104,12 +105,13 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len)
return 0;
}
-static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name)
+static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox)
{
int res, err;
struct ovl_fh *fh = NULL;
- res = vfs_getxattr(dentry, name, NULL, 0);
+ res = ovl_do_getxattr(ofs, dentry, ox, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return NULL;
@@ -123,7 +125,7 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name)
if (!fh)
return ERR_PTR(-ENOMEM);
- res = vfs_getxattr(dentry, name, fh->buf, res);
+ res = ovl_do_getxattr(ofs, dentry, ox, fh->buf, res);
if (res < 0)
goto fail;
@@ -186,9 +188,9 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
return real;
}
-static bool ovl_is_opaquedir(struct dentry *dentry)
+static bool ovl_is_opaquedir(struct super_block *sb, struct dentry *dentry)
{
- return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE);
+ return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_OPAQUE);
}
static struct dentry *ovl_lookup_positive_unlocked(const char *name,
@@ -251,7 +253,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(this);
+ err = ovl_check_metacopy_xattr(OVL_FS(d->sb), this);
if (err < 0)
goto out_err;
@@ -271,7 +273,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
if (d->last)
goto out;
- if (ovl_is_opaquedir(this)) {
+ if (ovl_is_opaquedir(d->sb, this)) {
d->stop = true;
if (last_element)
d->opaque = true;
@@ -391,7 +393,7 @@ invalid:
static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
struct ovl_path **stackp)
{
- struct ovl_fh *fh = ovl_get_fh(upperdentry, OVL_XATTR_ORIGIN);
+ struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN);
int err;
if (IS_ERR_OR_NULL(fh))
@@ -413,10 +415,10 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
* Verify that @fh matches the file handle stored in xattr @name.
* Return 0 on match, -ESTALE on mismatch, < 0 on error.
*/
-static int ovl_verify_fh(struct dentry *dentry, const char *name,
- const struct ovl_fh *fh)
+static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const struct ovl_fh *fh)
{
- struct ovl_fh *ofh = ovl_get_fh(dentry, name);
+ struct ovl_fh *ofh = ovl_get_fh(ofs, dentry, ox);
int err = 0;
if (!ofh)
@@ -440,8 +442,9 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name,
*
* Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error.
*/
-int ovl_verify_set_fh(struct dentry *dentry, const char *name,
- struct dentry *real, bool is_upper, bool set)
+int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real, bool is_upper,
+ bool set)
{
struct inode *inode;
struct ovl_fh *fh;
@@ -454,9 +457,9 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name,
goto fail;
}
- err = ovl_verify_fh(dentry, name, fh);
+ err = ovl_verify_fh(ofs, dentry, ox, fh);
if (set && err == -ENODATA)
- err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0);
+ err = ovl_do_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
if (err)
goto fail;
@@ -481,7 +484,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
if (!d_is_dir(index))
return dget(index);
- fh = ovl_get_fh(index, OVL_XATTR_UPPER);
+ fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER);
if (IS_ERR_OR_NULL(fh))
return ERR_CAST(fh);
@@ -574,7 +577,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
goto fail;
}
- err = ovl_verify_fh(upper, OVL_XATTR_ORIGIN, fh);
+ err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh);
dput(upper);
if (err)
goto fail;
@@ -585,7 +588,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
if (err)
goto fail;
- if (ovl_get_nlink(origin.dentry, index, 0) == 0)
+ if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0)
goto orphan;
}
@@ -741,7 +744,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
}
/* Verify that dir index 'upper' xattr points to upper dir */
- err = ovl_verify_upper(index, upper, false);
+ err = ovl_verify_upper(ofs, index, upper, false);
if (err) {
if (err == -ESTALE) {
pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n",
@@ -790,12 +793,12 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
}
/* Fix missing 'origin' xattr */
-static int ovl_fix_origin(struct dentry *dentry, struct dentry *lower,
- struct dentry *upper)
+static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
+ struct dentry *lower, struct dentry *upper)
{
int err;
- if (ovl_check_origin_xattr(upper))
+ if (ovl_check_origin_xattr(ofs, upper))
return 0;
err = ovl_want_write(dentry);
@@ -920,7 +923,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* of lower dir and set upper parent "impure".
*/
if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) {
- err = ovl_fix_origin(dentry, this, upperdentry);
+ err = ovl_fix_origin(ofs, dentry, this, upperdentry);
if (err) {
dput(this);
goto out_put;
@@ -939,7 +942,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperdentry && !ctr &&
((d.is_dir && ovl_verify_lower(dentry->d_sb)) ||
(!d.is_dir && ofs->config.index && origin_path))) {
- err = ovl_verify_origin(upperdentry, this, false);
+ err = ovl_verify_origin(ofs, upperdentry, this, false);
if (err) {
dput(this);
if (d.is_dir)
@@ -1060,13 +1063,13 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
ovl_dentry_set_upper_alias(dentry);
else if (index) {
upperdentry = dget(index);
- upperredirect = ovl_get_redirect_xattr(upperdentry, 0);
+ upperredirect = ovl_get_redirect_xattr(ofs, upperdentry, 0);
if (IS_ERR(upperredirect)) {
err = PTR_ERR(upperredirect);
upperredirect = NULL;
goto out_free_oe;
}
- err = ovl_check_metacopy_xattr(upperdentry);
+ err = ovl_check_metacopy_xattr(ofs, upperdentry);
if (err < 0)
goto out_free_oe;
uppermetacopy = err;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 29bc1ec699e7..f8880aa2ba0e 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -23,13 +23,16 @@ enum ovl_path_type {
#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN)
#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
-#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
-#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
-#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin"
-#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure"
-#define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink"
-#define OVL_XATTR_UPPER OVL_XATTR_PREFIX "upper"
-#define OVL_XATTR_METACOPY OVL_XATTR_PREFIX "metacopy"
+
+enum ovl_xattr {
+ OVL_XATTR_OPAQUE,
+ OVL_XATTR_REDIRECT,
+ OVL_XATTR_ORIGIN,
+ OVL_XATTR_IMPURE,
+ OVL_XATTR_NLINK,
+ OVL_XATTR_UPPER,
+ OVL_XATTR_METACOPY,
+};
enum ovl_inode_flag {
/* Pure upper dir that may contain non pure upper entries */
@@ -110,6 +113,12 @@ struct ovl_fh {
#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \
offsetof(struct ovl_fb, fid))
+extern const char *ovl_xattr_table[];
+static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
+{
+ return ovl_xattr_table[ox];
+}
+
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
@@ -170,17 +179,29 @@ static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
return err;
}
-static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, void *value,
+ size_t size)
+{
+ const char *name = ovl_xattr(ofs, ox);
+ return vfs_getxattr(dentry, name, value, size);
+}
+
+static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const void *value,
+ size_t size)
{
- int err = vfs_setxattr(dentry, name, value, size, flags);
- pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n",
- dentry, name, min((int)size, 48), value, size, flags, err);
+ const char *name = ovl_xattr(ofs, ox);
+ int err = vfs_setxattr(dentry, name, value, size, 0);
+ pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
+ dentry, name, min((int)size, 48), value, size, err);
return err;
}
-static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
+static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox)
{
+ const char *name = ovl_xattr(ofs, ox);
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
@@ -280,10 +301,11 @@ struct file *ovl_path_open(struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_check_origin_xattr(struct dentry *dentry);
-bool ovl_check_dir_xattr(struct dentry *dentry, const char *name);
+bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry);
+bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
+ enum ovl_xattr ox);
int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
- const char *name, const void *value, size_t size,
+ enum ovl_xattr ox, const void *value, size_t size,
int xerr);
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
void ovl_set_flag(unsigned long flag, struct inode *inode);
@@ -296,15 +318,15 @@ bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
-int ovl_check_metacopy_xattr(struct dentry *dentry);
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
-char *ovl_get_redirect_xattr(struct dentry *dentry, int padding);
-ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
- size_t padding);
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ int padding);
-static inline bool ovl_is_impuredir(struct dentry *dentry)
+static inline bool ovl_is_impuredir(struct super_block *sb,
+ struct dentry *dentry)
{
- return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
+ return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_IMPURE);
}
/*
@@ -365,8 +387,9 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
bool connected);
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *upperdentry, struct ovl_path **stackp);
-int ovl_verify_set_fh(struct dentry *dentry, const char *name,
- struct dentry *real, bool is_upper, bool set);
+int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real, bool is_upper,
+ bool set);
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
int ovl_get_index_name(struct dentry *origin, struct qstr *name);
@@ -378,20 +401,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
bool ovl_lower_positive(struct dentry *dentry);
-static inline int ovl_verify_origin(struct dentry *upper,
+static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool set)
{
- return ovl_verify_set_fh(upper, OVL_XATTR_ORIGIN, origin, false, set);
+ return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin,
+ false, set);
}
-static inline int ovl_verify_upper(struct dentry *index,
- struct dentry *upper, bool set)
+static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
+ struct dentry *upper, bool set)
{
- return ovl_verify_set_fh(index, OVL_XATTR_UPPER, upper, true, set);
+ return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set);
}
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
+struct file *ovl_dir_real_file(const struct file *file, bool want_upper);
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
@@ -404,7 +429,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs);
/* inode.c */
int ovl_set_nlink_upper(struct dentry *dentry);
int ovl_set_nlink_lower(struct dentry *dentry);
-unsigned int ovl_get_nlink(struct dentry *lowerdentry,
+unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback);
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
@@ -418,7 +443,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
struct posix_acl *ovl_get_acl(struct inode *inode, int type);
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
-bool ovl_is_private_xattr(const char *name);
+bool ovl_is_private_xattr(struct super_block *sb, const char *name);
struct ovl_inode_params {
struct inode *newinode;
@@ -479,12 +504,15 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
extern const struct file_operations ovl_file_operations;
int __init ovl_aio_request_cache_init(void);
void ovl_aio_request_cache_destroy(void);
+long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
-int ovl_copy_xattr(struct dentry *old, struct dentry *new);
+int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
+ struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index b429c80879ee..1b5a2094df8e 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -17,6 +17,7 @@ struct ovl_config {
bool nfs_export;
int xino;
bool metacopy;
+ bool ovl_volatile;
};
struct ovl_sb {
@@ -90,6 +91,11 @@ static inline struct ovl_fs *OVL_FS(struct super_block *sb)
return (struct ovl_fs *)sb->s_fs_info;
}
+static inline bool ovl_should_sync(struct ovl_fs *ofs)
+{
+ return !ofs->config.ovl_volatile;
+}
+
/* private information held for every overlayfs dentry */
struct ovl_entry {
union {
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 6918b98faeb6..01620ebae1bd 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -606,6 +606,7 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
{
int res;
struct dentry *dentry = path->dentry;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(d_inode(dentry));
@@ -632,7 +633,7 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
* Removing the "impure" xattr is best effort.
*/
if (!ovl_want_write(dentry)) {
- ovl_do_removexattr(ovl_dentry_upper(dentry),
+ ovl_do_removexattr(ofs, ovl_dentry_upper(dentry),
OVL_XATTR_IMPURE);
ovl_drop_write(dentry);
}
@@ -839,7 +840,7 @@ out_unlock:
return res;
}
-static struct file *ovl_dir_open_realfile(struct file *file,
+static struct file *ovl_dir_open_realfile(const struct file *file,
struct path *realpath)
{
struct file *res;
@@ -852,16 +853,22 @@ static struct file *ovl_dir_open_realfile(struct file *file,
return res;
}
-static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
+/*
+ * Like ovl_real_fdget(), returns upperfile if dir was copied up since open.
+ * Unlike ovl_real_fdget(), this caches upperfile in file->private_data.
+ *
+ * TODO: use same abstract type for file->private_data of dir and file so
+ * upperfile could be cached for files as well.
+ */
+struct file *ovl_dir_real_file(const struct file *file, bool want_upper)
{
+
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
- /* Nothing to sync for lower */
if (!OVL_TYPE_UPPER(ovl_path_type(dentry)))
- return 0;
+ return want_upper ? NULL : realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
@@ -880,7 +887,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
if (!od->upperfile) {
if (IS_ERR(realfile)) {
inode_unlock(inode);
- return PTR_ERR(realfile);
+ return realfile;
}
smp_store_release(&od->upperfile, realfile);
} else {
@@ -893,6 +900,25 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
}
}
+ return realfile;
+}
+
+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct file *realfile;
+ int err;
+
+ if (!ovl_should_sync(OVL_FS(file->f_path.dentry->d_sb)))
+ return 0;
+
+ realfile = ovl_dir_real_file(file, true);
+ err = PTR_ERR_OR_ZERO(realfile);
+
+ /* Nothing to sync for lower */
+ if (!realfile || err)
+ return err;
+
return vfs_fsync_range(realfile, start, end, datasync);
}
@@ -945,6 +971,10 @@ const struct file_operations ovl_dir_operations = {
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
+ .unlocked_ioctl = ovl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ovl_compat_ioctl,
+#endif
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
@@ -1051,7 +1081,9 @@ int ovl_check_d_type_supported(struct path *realpath)
return rdd.d_type_supported;
}
-static void ovl_workdir_cleanup_recurse(struct path *path, int level)
+#define OVL_INCOMPATDIR_NAME "incompat"
+
+static int ovl_workdir_cleanup_recurse(struct path *path, int level)
{
int err;
struct inode *dir = path->dentry->d_inode;
@@ -1065,6 +1097,19 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level)
.root = &root,
.is_lowest = false,
};
+ bool incompat = false;
+
+ /*
+ * The "work/incompat" directory is treated specially - if it is not
+ * empty, instead of printing a generic error and mounting read-only,
+ * we will error about incompat features and fail the mount.
+ *
+ * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name
+ * starts with '#'.
+ */
+ if (level == 2 &&
+ !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME))
+ incompat = true;
err = ovl_dir_read(path, &rdd);
if (err)
@@ -1079,17 +1124,25 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
+ } else if (incompat) {
+ pr_err("overlay with incompat feature '%s' cannot be mounted\n",
+ p->name);
+ err = -EINVAL;
+ break;
}
dentry = lookup_one_len(p->name, path->dentry, p->len);
if (IS_ERR(dentry))
continue;
if (dentry->d_inode)
- ovl_workdir_cleanup(dir, path->mnt, dentry, level);
+ err = ovl_workdir_cleanup(dir, path->mnt, dentry, level);
dput(dentry);
+ if (err)
+ break;
}
inode_unlock(dir);
out:
ovl_cache_free(&list);
+ return err;
}
int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
@@ -1106,9 +1159,10 @@ int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
struct path path = { .mnt = mnt, .dentry = dentry };
inode_unlock(dir);
- ovl_workdir_cleanup_recurse(&path, level + 1);
+ err = ovl_workdir_cleanup_recurse(&path, level + 1);
inode_lock_nested(dir, I_MUTEX_PARENT);
- err = ovl_cleanup(dir, dentry);
+ if (!err)
+ err = ovl_cleanup(dir, dentry);
}
return err;
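The readdir.c changes turn a fire-and-forget recursive cleanup into one that returns an error, so a populated incompat directory aborts the walk (and the mount) instead of being silently deleted. The shape of that change, with hypothetical demo_* helpers:

	#include <linux/errno.h>

	static bool demo_is_incompat(const char *name, int level);	/* hypothetical */
	static int demo_cleanup_entry(const char *name, int level);	/* hypothetical */

	static int demo_cleanup_recurse(const char *const *names, int n, int level)
	{
		int i, err = 0;

		for (i = 0; i < n; i++) {
			if (demo_is_incompat(names[i], level))
				return -EINVAL;	/* unknown incompat feature: fail */
			err = demo_cleanup_entry(names[i], level);
			if (err)
				break;		/* propagate instead of continuing */
		}
		return err;
	}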
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 4b38141c2985..290983bcfbb3 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -264,6 +264,8 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
if (!ovl_upper_mnt(ofs))
return 0;
+ if (!ovl_should_sync(ofs))
+ return 0;
/*
* Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
* All the super blocks will be iterated, including upper_sb.
@@ -362,6 +364,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
if (ofs->config.metacopy != ovl_metacopy_def)
seq_printf(m, ",metacopy=%s",
ofs->config.metacopy ? "on" : "off");
+ if (ofs->config.ovl_volatile)
+ seq_puts(m, ",volatile");
return 0;
}
@@ -376,9 +380,11 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
if (*flags & SB_RDONLY && !sb_rdonly(sb)) {
upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
- down_read(&upper_sb->s_umount);
- ret = sync_filesystem(upper_sb);
- up_read(&upper_sb->s_umount);
+ if (ovl_should_sync(ofs)) {
+ down_read(&upper_sb->s_umount);
+ ret = sync_filesystem(upper_sb);
+ up_read(&upper_sb->s_umount);
+ }
}
return ret;
@@ -411,6 +417,7 @@ enum {
OPT_XINO_AUTO,
OPT_METACOPY_ON,
OPT_METACOPY_OFF,
+ OPT_VOLATILE,
OPT_ERR,
};
@@ -429,6 +436,7 @@ static const match_table_t ovl_tokens = {
{OPT_XINO_AUTO, "xino=auto"},
{OPT_METACOPY_ON, "metacopy=on"},
{OPT_METACOPY_OFF, "metacopy=off"},
+ {OPT_VOLATILE, "volatile"},
{OPT_ERR, NULL}
};
@@ -573,6 +581,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
metacopy_opt = true;
break;
+ case OPT_VOLATILE:
+ config->ovl_volatile = true;
+ break;
+
default:
pr_err("unrecognized mount option \"%s\" or missing value\n",
p);
@@ -595,6 +607,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
config->index = false;
}
+ if (!config->upperdir && config->ovl_volatile) {
+ pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n");
+ config->ovl_volatile = false;
+ }
+
err = ovl_parse_redirect_mode(config, config->redirect_mode);
if (err)
return err;
@@ -705,8 +722,12 @@ retry:
goto out_unlock;
retried = true;
- ovl_workdir_cleanup(dir, mnt, work, 0);
+ err = ovl_workdir_cleanup(dir, mnt, work, 0);
dput(work);
+ if (err == -EINVAL) {
+ work = ERR_PTR(err);
+ goto out_unlock;
+ }
goto retry;
}
@@ -1199,11 +1220,50 @@ out_unlock:
return err;
}
+static struct dentry *ovl_lookup_or_create(struct dentry *parent,
+ const char *name, umode_t mode)
+{
+ size_t len = strlen(name);
+ struct dentry *child;
+
+ inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+ child = lookup_one_len(name, parent, len);
+ if (!IS_ERR(child) && !child->d_inode)
+ child = ovl_create_real(parent->d_inode, child,
+ OVL_CATTR(mode));
+ inode_unlock(parent->d_inode);
+ dput(parent);
+
+ return child;
+}
+
+/*
+ * Creates $workdir/work/incompat/volatile/dirty file if it is not already
+ * present.
+ */
+static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
+{
+ unsigned int ctr;
+ struct dentry *d = dget(ofs->workbasedir);
+ static const char *const volatile_path[] = {
+ OVL_WORKDIR_NAME, "incompat", "volatile", "dirty"
+ };
+ const char *const *name = volatile_path;
+
+ for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) {
+ d = ovl_lookup_or_create(d, *name, ctr > 1 ? S_IFDIR : S_IFREG);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ }
+ dput(d);
+ return 0;
+}
+
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
struct path *workpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
- struct dentry *temp;
+ struct dentry *temp, *workdir;
bool rename_whiteout;
bool d_type;
int fh_type;
@@ -1213,10 +1273,13 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
if (err)
return err;
- ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false);
- if (!ofs->workdir)
+ workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false);
+ err = PTR_ERR(workdir);
+ if (IS_ERR_OR_NULL(workdir))
goto out;
+ ofs->workdir = workdir;
+
err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir");
if (err)
goto out;
@@ -1256,7 +1319,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
/*
* Check if upper/work fs supports trusted.overlay.* xattr
*/
- err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0);
+ err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
if (err) {
ofs->noxattr = true;
ofs->config.index = false;
@@ -1264,7 +1327,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n");
err = 0;
} else {
- vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
+ ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
}
/*
@@ -1279,6 +1342,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
goto out;
}
+ /*
+ * For volatile mount, create a incompat/volatile/dirty file to keep
+ * track of it.
+ */
+ if (ofs->config.ovl_volatile) {
+ err = ovl_create_volatile_dirty(ofs);
+ if (err < 0) {
+ pr_err("Failed to create volatile/dirty file.\n");
+ goto out;
+ }
+ }
+
/* Check if upper/work fs supports file handles */
fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
if (ofs->config.index && !fh_type) {
@@ -1347,6 +1422,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
struct ovl_entry *oe, struct path *upperpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
+ struct dentry *indexdir;
int err;
err = mnt_want_write(mnt);
@@ -1354,8 +1430,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
return err;
/* Verify lower root is upper root origin */
- err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry,
- true);
+ err = ovl_verify_origin(ofs, upperpath->dentry,
+ oe->lowerstack[0].dentry, true);
if (err) {
pr_err("failed to verify upper root origin\n");
goto out;
@@ -1366,9 +1442,12 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
ofs->workdir_trap = NULL;
dput(ofs->workdir);
ofs->workdir = NULL;
- ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
- if (ofs->indexdir) {
- ofs->workdir = dget(ofs->indexdir);
+ indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
+ if (IS_ERR(indexdir)) {
+ err = PTR_ERR(indexdir);
+ } else if (indexdir) {
+ ofs->indexdir = indexdir;
+ ofs->workdir = dget(indexdir);
err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap,
"indexdir");
@@ -1383,13 +1462,15 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
* "trusted.overlay.upper" to indicate that index may have
* directory entries.
*/
- if (ovl_check_origin_xattr(ofs->indexdir)) {
- err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN,
+ if (ovl_check_origin_xattr(ofs, ofs->indexdir)) {
+ err = ovl_verify_set_fh(ofs, ofs->indexdir,
+ OVL_XATTR_ORIGIN,
upperpath->dentry, true, false);
if (err)
pr_err("failed to verify index dir 'origin' xattr\n");
}
- err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true);
+ err = ovl_verify_upper(ofs, ofs->indexdir, upperpath->dentry,
+ true);
if (err)
pr_err("failed to verify index dir 'upper' xattr\n");
@@ -1755,7 +1836,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
ino = d_inode(upperdentry)->i_ino;
fsid = 0;
ovl_dentry_set_upper_alias(root);
- if (ovl_is_impuredir(upperdentry))
+ if (ovl_is_impuredir(sb, upperdentry))
ovl_set_flag(OVL_IMPURE, d_inode(root));
}
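ovl_create_volatile_dirty() walks a fixed name array, creating a directory for every component except the last, which becomes the dirty regular file. An equivalent forward-counting sketch, assuming a hypothetical lookup-or-create helper with the same ref-consuming contract:

	#include <linux/dcache.h>
	#include <linux/err.h>
	#include <linux/kernel.h>
	#include <linux/stat.h>

	/* hypothetical: returns the (created) child and consumes the parent ref */
	static struct dentry *demo_lookup_or_create(struct dentry *parent,
						    const char *name, umode_t mode);

	static int demo_create_dirty_file(struct dentry *base)
	{
		static const char *const path[] = {
			"work", "incompat", "volatile", "dirty"
		};
		struct dentry *d = dget(base);
		unsigned int i;

		for (i = 0; i < ARRAY_SIZE(path); i++) {
			/* directories for the inner components, a file for the leaf */
			d = demo_lookup_or_create(d, path[i],
						  i + 1 < ARRAY_SIZE(path) ?
							S_IFDIR : S_IFREG);
			if (IS_ERR(d))
				return PTR_ERR(d);
		}
		dput(d);
		return 0;
	}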
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 56c1f89f20c9..23f475627d07 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -544,11 +544,11 @@ void ovl_copy_up_end(struct dentry *dentry)
ovl_inode_unlock(d_inode(dentry));
}
-bool ovl_check_origin_xattr(struct dentry *dentry)
+bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry)
{
int res;
- res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0);
+ res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_ORIGIN, NULL, 0);
/* Zero size value means "copied up but origin unknown" */
if (res >= 0)
@@ -557,7 +557,8 @@ bool ovl_check_origin_xattr(struct dentry *dentry)
return false;
}
-bool ovl_check_dir_xattr(struct dentry *dentry, const char *name)
+bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
+ enum ovl_xattr ox)
{
int res;
char val;
@@ -565,15 +566,36 @@ bool ovl_check_dir_xattr(struct dentry *dentry, const char *name)
if (!d_is_dir(dentry))
return false;
- res = vfs_getxattr(dentry, name, &val, 1);
+ res = ovl_do_getxattr(OVL_FS(sb), dentry, ox, &val, 1);
if (res == 1 && val == 'y')
return true;
return false;
}
+#define OVL_XATTR_OPAQUE_POSTFIX "opaque"
+#define OVL_XATTR_REDIRECT_POSTFIX "redirect"
+#define OVL_XATTR_ORIGIN_POSTFIX "origin"
+#define OVL_XATTR_IMPURE_POSTFIX "impure"
+#define OVL_XATTR_NLINK_POSTFIX "nlink"
+#define OVL_XATTR_UPPER_POSTFIX "upper"
+#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
+
+#define OVL_XATTR_TAB_ENTRY(x) \
+ [x] = OVL_XATTR_PREFIX x ## _POSTFIX
+
+const char *ovl_xattr_table[] = {
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
+};
+
int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
- const char *name, const void *value, size_t size,
+ enum ovl_xattr ox, const void *value, size_t size,
int xerr)
{
int err;
@@ -582,10 +604,10 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
if (ofs->noxattr)
return xerr;
- err = ovl_do_setxattr(upperdentry, name, value, size, 0);
+ err = ovl_do_setxattr(ofs, upperdentry, ox, value, size);
if (err == -EOPNOTSUPP) {
- pr_warn("cannot set %s xattr on upper\n", name);
+ pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox));
ofs->noxattr = true;
return xerr;
}
@@ -845,7 +867,7 @@ err:
}
/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */
-int ovl_check_metacopy_xattr(struct dentry *dentry)
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry)
{
int res;
@@ -853,7 +875,7 @@ int ovl_check_metacopy_xattr(struct dentry *dentry)
if (!S_ISREG(d_inode(dentry)->i_mode))
return 0;
- res = vfs_getxattr(dentry, OVL_XATTR_METACOPY, NULL, 0);
+ res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_METACOPY, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return 0;
@@ -882,49 +904,27 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
- size_t padding)
-{
- ssize_t res;
- char *buf = NULL;
-
- res = vfs_getxattr(dentry, name, NULL, 0);
- if (res < 0) {
- if (res == -ENODATA || res == -EOPNOTSUPP)
- return -ENODATA;
- goto fail;
- }
-
- if (res != 0) {
- buf = kzalloc(res + padding, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- res = vfs_getxattr(dentry, name, buf, res);
- if (res < 0)
- goto fail;
- }
- *value = buf;
-
- return res;
-
-fail:
- pr_warn_ratelimited("failed to get xattr %s: err=%zi)\n",
- name, res);
- kfree(buf);
- return res;
-}
-
-char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ int padding)
{
int res;
char *s, *next, *buf = NULL;
- res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1);
- if (res == -ENODATA)
+ res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, NULL, 0);
+ if (res == -ENODATA || res == -EOPNOTSUPP)
return NULL;
if (res < 0)
- return ERR_PTR(res);
+ goto fail;
+ if (res == 0)
+ goto invalid;
+
+ buf = kzalloc(res + padding + 1, GFP_KERNEL);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, buf, res);
+ if (res < 0)
+ goto fail;
if (res == 0)
goto invalid;
@@ -943,6 +943,10 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
invalid:
pr_warn_ratelimited("invalid redirect (%s)\n", buf);
res = -EINVAL;
+ goto err_free;
+fail:
+ pr_warn_ratelimited("failed to get redirect (%i)\n", res);
+err_free:
kfree(buf);
return ERR_PTR(res);
}
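
The rework of ovl_get_redirect_xattr() above inlines the classic two-pass getxattr idiom: probe with a NULL buffer to learn the value size, allocate (plus padding, as the patch does with res + padding + 1), then fetch for real. A minimal userspace sketch of the same pattern against libc's getxattr(2) follows; the file path and attribute name are illustrative, not taken from the patch.

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/xattr.h>

    /*
     * Read an xattr into a freshly allocated, NUL-terminated buffer.
     * Returns NULL (with errno set by getxattr/calloc) on failure.
     */
    static char *read_xattr(const char *path, const char *name)
    {
        ssize_t size, res;
        char *buf;

        /* Pass 1: size probe only (NULL buffer, zero length). */
        size = getxattr(path, name, NULL, 0);
        if (size <= 0)
            return NULL;

        /* One byte of padding for the terminating NUL, much as the
         * patch allocates res + padding + 1 for later editing. */
        buf = calloc(1, size + 1);
        if (!buf)
            return NULL;

        /* Pass 2: the real read; can still fail if the value was
         * replaced with a larger one in between. */
        res = getxattr(path, name, buf, size);
        if (res < 0) {
            free(buf);
            return NULL;
        }
        return buf;
    }

    int main(void)
    {
        /* Hypothetical path and attribute name, for illustration. */
        char *v = read_xattr("somefile", "user.comment");

        if (v) {
            printf("value: %s\n", v);
            free(v);
        }
        return 0;
    }
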
diff --git a/fs/pipe.c b/fs/pipe.c
index 60dbee457143..0ac197658a2d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
}
}
-/* Drop the inode semaphore and wait for a pipe event, atomically */
-void pipe_wait(struct pipe_inode_info *pipe)
-{
- DEFINE_WAIT(rdwait);
- DEFINE_WAIT(wrwait);
-
- /*
- * Pipes are system-local resources, so sleeping on them
- * is considered a noninteractive wait:
- */
- prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
- prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
- pipe_unlock(pipe);
- schedule();
- finish_wait(&pipe->rd_wait, &rdwait);
- finish_wait(&pipe->wr_wait, &wrwait);
- pipe_lock(pipe);
-}
-
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -913,19 +894,18 @@ int create_pipe_files(struct file **res, int flags)
{
struct inode *inode = get_pipe_inode();
struct file *f;
+ int error;
if (!inode)
return -ENFILE;
if (flags & O_NOTIFICATION_PIPE) {
-#ifdef CONFIG_WATCH_QUEUE
- if (watch_queue_init(inode->i_pipe) < 0) {
+ error = watch_queue_init(inode->i_pipe);
+ if (error) {
+ free_pipe_info(inode->i_pipe);
iput(inode);
- return -ENOMEM;
+ return error;
}
-#else
- return -ENOPKG;
-#endif
}
f = alloc_file_pseudo(inode, pipe_mnt, "",
@@ -1035,12 +1015,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
return do_pipe2(fildes, 0);
}
+/*
+ * This is the stupid "wait for pipe to be readable or writable"
+ * model.
+ *
+ * See pipe_read/write() for the proper kind of exclusive wait,
+ * but that requires that we wake up any other readers/writers
+ * if we then do not end up reading everything (ie the whole
+ * "wake_next_reader/writer" logic in pipe_read/write()).
+ */
+void pipe_wait_readable(struct pipe_inode_info *pipe)
+{
+ pipe_unlock(pipe);
+ wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
+ pipe_lock(pipe);
+}
+
+void pipe_wait_writable(struct pipe_inode_info *pipe)
+{
+ pipe_unlock(pipe);
+ wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
+ pipe_lock(pipe);
+}
+
+/*
+ * This depends on both the wait (here) and the wakeup (wake_up_partner)
+ * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
+ * race with the count check and waitqueue prep.
+ *
+ * Normally in order to avoid races, you'd do the prepare_to_wait() first,
+ * then check the condition you're waiting for, and only then sleep. But
+ * because of the pipe lock, we can check the condition before being on
+ * the wait queue.
+ *
+ * We use the 'rd_wait' waitqueue for pipe partner waiting.
+ */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
+ DEFINE_WAIT(rdwait);
int cur = *cnt;
while (cur == *cnt) {
- pipe_wait(pipe);
+ prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
+ pipe_unlock(pipe);
+ schedule();
+ finish_wait(&pipe->rd_wait, &rdwait);
+ pipe_lock(pipe);
if (signal_pending(current))
break;
}
@@ -1050,7 +1070,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
static void wake_up_partner(struct pipe_inode_info *pipe)
{
wake_up_interruptible_all(&pipe->rd_wait);
- wake_up_interruptible_all(&pipe->wr_wait);
}
static int fifo_open(struct inode *inode, struct file *filp)
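
The pipe_wait() removal above splits one dual-queue sleep into direction-specific helpers, while wait_for_partner() keeps a raw prepare_to_wait()/schedule() sequence because the pipe lock already makes its condition check race-free. The hazard both variants guard against — a wakeup landing between "check condition" and "go to sleep" — is what pthread_cond_wait() solves in userspace by releasing the mutex atomically. A rough analogue with hypothetical names (signal handling omitted):

    #include <pthread.h>
    #include <stdbool.h>

    struct upipe {
        pthread_mutex_t lock;     /* stands in for the pipe mutex   */
        pthread_cond_t rd_wait;   /* signalled when data arrives    */
        unsigned int head, tail;  /* ring indices; empty when equal */
    };

    static bool upipe_readable(const struct upipe *p)
    {
        return p->head != p->tail;
    }

    /*
     * Analogous to pipe_wait_readable(): sleep until data shows up,
     * with the lock dropped for the duration of the sleep. The condvar
     * re-checks the predicate after every wakeup, like wait_event_*().
     */
    static void upipe_wait_readable(struct upipe *p)
    {
        pthread_mutex_lock(&p->lock);
        while (!upipe_readable(p))
            pthread_cond_wait(&p->rd_wait, &p->lock);
        pthread_mutex_unlock(&p->lock);
    }

    /*
     * Writer side: publish under the lock, then wake sleeping readers,
     * mirroring the wake_up_interruptible() calls in pipe_write().
     */
    static void upipe_publish(struct upipe *p)
    {
        pthread_mutex_lock(&p->lock);
        p->head++;
        pthread_cond_broadcast(&p->rd_wait);
        pthread_mutex_unlock(&p->lock);
    }

    int main(void)
    {
        struct upipe p = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .rd_wait = PTHREAD_COND_INITIALIZER,
        };

        upipe_publish(&p);        /* make data available first ...   */
        upipe_wait_readable(&p);  /* ... so this returns immediately */
        return 0;
    }
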
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 617db4e0faa0..0f707003dda5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1055,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
- static DEFINE_MUTEX(oom_adj_mutex);
struct mm_struct *mm = NULL;
struct task_struct *task;
int err = 0;
@@ -1095,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
struct task_struct *p = find_lock_task_mm(task);
if (p) {
- if (atomic_read(&p->mm->mm_users) > 1) {
+ if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
mm = p->mm;
mmgrab(mm);
}
@@ -1269,6 +1268,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
kuid_t kloginuid;
int rv;
+ /* Don't let kthreads write their own loginuid */
+ if (current->flags & PF_KTHREAD)
+ return -EPERM;
+
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f909243d4a66..9f1077d94cde 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -217,6 +217,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
+#ifdef CONFIG_64BIT
+ u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
return u;
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5066b0251ed8..217aa2705d5d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -520,16 +520,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
- page = find_get_entry(vma->vm_file->f_mapping,
+ page = xa_load(&vma->vm_file->f_mapping->i_pages,
linear_page_index(vma, addr));
- if (!page)
- return;
-
if (xa_is_value(page))
mss->swap += PAGE_SIZE;
- else
- put_page(page);
-
return;
}
@@ -653,6 +647,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
@@ -723,9 +721,21 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pte_hole = smaps_pte_hole,
};
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss)
+ struct mem_size_stats *mss, unsigned long start)
{
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;
+
#ifdef CONFIG_SHMEM
/* In case of smaps_rollup, reset the value from previous vma */
mss->check_shmem_swap = false;
@@ -742,18 +752,20 @@ static void smap_gather_stats(struct vm_area_struct *vma,
*/
unsigned long shmem_swapped = shmem_swap_usage(vma);
- if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE)) {
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
- walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
- return;
+ ops = &smaps_shmem_walk_ops;
}
}
#endif
/* mmap_lock is held in m_start */
- walk_page_vma(vma, &smaps_walk_ops, mss);
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}
#define SEQ_PUT_DEC(str, val) \
@@ -805,7 +817,7 @@ static int show_smap(struct seq_file *m, void *v)
memset(&mss, 0, sizeof(mss));
- smap_gather_stats(vma, &mss);
+ smap_gather_stats(vma, &mss, 0);
show_map_vma(m, vma);
@@ -853,9 +865,73 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
hold_task_mempolicy(priv);
- for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
- smap_gather_stats(vma, &mss);
+ for (vma = priv->mm->mmap; vma;) {
+ smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
+
+ /*
+ * Release mmap_lock temporarily if someone else
+ * wants to take it for a write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA2.
+ * Iterate the loop like the original one.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * find_vma(mm, 16k - 1) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * find_vma(mm, 16k - 1) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = find_vma(mm, last_vma_end - 1);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 above */
+ if (vma->vm_start >= last_vma_end)
+ continue;
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end)
+ smap_gather_stats(vma, &mss, last_vma_end);
+ }
+ /* Case 2 above */
+ vma = vma->vm_next;
}
show_vma_header_prefix(m, priv->mm->mmap->vm_start,
@@ -1168,24 +1244,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = -EINTR;
goto out_mm;
}
- /*
- * Avoid to modify vma->vm_flags
- * without locked ops while the
- * coredump reads the vm_flags.
- */
- if (!mmget_still_valid(mm)) {
- /*
- * Silently return "count"
- * like if get_task_mm()
- * failed. FIXME: should this
- * function have returned
- * -ESRCH if get_task_mm()
- * failed like if
- * get_proc_task() fails?
- */
- mmap_write_unlock(mm);
- goto out_mm;
- }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
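
The rollup loop above encodes a general pattern for walking a mutable, sorted structure while periodically dropping the lock that protects it: remember the end of the last processed element, re-find the position with a search for (last_vma_end - 1) after re-acquiring, and classify the result into the four cases the comment spells out. A condensed sketch of that resume logic over a sorted array of [start, end) ranges; the names and types are illustrative, not the kernel's:

    #include <stddef.h>
    #include <stdio.h>

    struct range { unsigned long start, end; };

    /* find_vma() analogue: lowest range whose end > addr, or NULL. */
    static const struct range *find_range(const struct range *r, size_t n,
                                          unsigned long addr)
    {
        for (size_t i = 0; i < n; i++)
            if (r[i].end > addr)
                return &r[i];
        return NULL;
    }

    /*
     * Classify what find_range(last_end - 1) returned after the lock
     * was re-acquired, mirroring the four cases in the comment above.
     */
    enum resume_action {
        RES_DONE,     /* case 3: nothing left to scan                */
        RES_FRESH,    /* case 1: old range gone; process v in full   */
        RES_SKIP,     /* case 2: v already processed; take successor */
        RES_PARTIAL,  /* case 4: v grew past last_end; rescan tail   */
    };

    static enum resume_action classify_resume(const struct range *v,
                                              unsigned long last_end)
    {
        if (!v)
            return RES_DONE;
        if (v->start >= last_end)
            return RES_FRESH;
        if (v->end > last_end)
            return RES_PARTIAL;
        return RES_SKIP;
    }

    int main(void)
    {
        /* The 4k/8k/16k/400k layout from the comment above. */
        struct range r[] = { {4096, 8192}, {8192, 16384}, {16384, 409600} };
        const struct range *v = find_range(r, 3, 16384 - 1);

        /* VMA2 survived the lock drop, so this prints RES_SKIP (2). */
        printf("resume action: %d\n", (int)classify_resume(v, 16384));
        return 0;
    }
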
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index d1ceb76adb71..b59cd172b5f9 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -70,8 +70,3 @@ config QFMT_V2
config QUOTACTL
bool
default n
-
-config QUOTACTL_COMPAT
- bool
- depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
- default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index f2b49d0f0287..9160639daffa 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -4,5 +4,4 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o kqid.o
-obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
deleted file mode 100644
index c30572857619..000000000000
--- a/fs/quota/compat.c
+++ /dev/null
@@ -1,120 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/syscalls.h>
-#include <linux/compat.h>
-#include <linux/quotaops.h>
-
-/*
- * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
- * and is necessary due to alignment problems.
- */
-struct compat_if_dqblk {
- compat_u64 dqb_bhardlimit;
- compat_u64 dqb_bsoftlimit;
- compat_u64 dqb_curspace;
- compat_u64 dqb_ihardlimit;
- compat_u64 dqb_isoftlimit;
- compat_u64 dqb_curinodes;
- compat_u64 dqb_btime;
- compat_u64 dqb_itime;
- compat_uint_t dqb_valid;
-};
-
-/* XFS structures */
-struct compat_fs_qfilestat {
- compat_u64 dqb_bhardlimit;
- compat_u64 qfs_nblks;
- compat_uint_t qfs_nextents;
-};
-
-struct compat_fs_quota_stat {
- __s8 qs_version;
- __u16 qs_flags;
- __s8 qs_pad;
- struct compat_fs_qfilestat qs_uquota;
- struct compat_fs_qfilestat qs_gquota;
- compat_uint_t qs_incoredqs;
- compat_int_t qs_btimelimit;
- compat_int_t qs_itimelimit;
- compat_int_t qs_rtbtimelimit;
- __u16 qs_bwarnlimit;
- __u16 qs_iwarnlimit;
-};
-
-COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd,
- const char __user *, special, qid_t, id,
- void __user *, addr)
-{
- unsigned int cmds;
- struct if_dqblk __user *dqblk;
- struct compat_if_dqblk __user *compat_dqblk;
- struct fs_quota_stat __user *fsqstat;
- struct compat_fs_quota_stat __user *compat_fsqstat;
- compat_uint_t data;
- u16 xdata;
- long ret;
-
- cmds = cmd >> SUBCMDSHIFT;
-
- switch (cmds) {
- case Q_GETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = kernel_quotactl(cmd, special, id, dqblk);
- if (ret)
- break;
- if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &dqblk->dqb_valid) ||
- put_user(data, &compat_dqblk->dqb_valid))
- ret = -EFAULT;
- break;
- case Q_SETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = -EFAULT;
- if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &compat_dqblk->dqb_valid) ||
- put_user(data, &dqblk->dqb_valid))
- break;
- ret = kernel_quotactl(cmd, special, id, dqblk);
- break;
- case Q_XGETQSTAT:
- fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
- compat_fsqstat = addr;
- ret = kernel_quotactl(cmd, special, id, fsqstat);
- if (ret)
- break;
- ret = -EFAULT;
- /* Copying qs_version, qs_flags, qs_pad */
- if (copy_in_user(compat_fsqstat, fsqstat,
- offsetof(struct compat_fs_quota_stat, qs_uquota)))
- break;
- /* Copying qs_uquota */
- if (copy_in_user(&compat_fsqstat->qs_uquota,
- &fsqstat->qs_uquota,
- sizeof(compat_fsqstat->qs_uquota)) ||
- get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
- break;
- /* Copying qs_gquota */
- if (copy_in_user(&compat_fsqstat->qs_gquota,
- &fsqstat->qs_gquota,
- sizeof(compat_fsqstat->qs_gquota)) ||
- get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
- break;
- /* Copying the rest */
- if (copy_in_user(&compat_fsqstat->qs_incoredqs,
- &fsqstat->qs_incoredqs,
- sizeof(struct compat_fs_quota_stat) -
- offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
- get_user(xdata, &fsqstat->qs_iwarnlimit) ||
- put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
- break;
- ret = 0;
- break;
- default:
- ret = kernel_quotactl(cmd, special, id, addr);
- }
- return ret;
-}
diff --git a/fs/quota/compat.h b/fs/quota/compat.h
new file mode 100644
index 000000000000..ef7d1e12d650
--- /dev/null
+++ b/fs/quota/compat.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
+
+struct compat_if_dqblk {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 dqb_bsoftlimit;
+ compat_u64 dqb_curspace;
+ compat_u64 dqb_ihardlimit;
+ compat_u64 dqb_isoftlimit;
+ compat_u64 dqb_curinodes;
+ compat_u64 dqb_btime;
+ compat_u64 dqb_itime;
+ compat_uint_t dqb_valid;
+};
+
+struct compat_fs_qfilestat {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 qfs_nblks;
+ compat_uint_t qfs_nextents;
+};
+
+struct compat_fs_quota_stat {
+ __s8 qs_version;
+ __u16 qs_flags;
+ __s8 qs_pad;
+ struct compat_fs_qfilestat qs_uquota;
+ struct compat_fs_qfilestat qs_gquota;
+ compat_uint_t qs_incoredqs;
+ compat_int_t qs_btimelimit;
+ compat_int_t qs_itimelimit;
+ compat_int_t qs_rtbtimelimit;
+ __u16 qs_bwarnlimit;
+ __u16 qs_iwarnlimit;
+};
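
The header above recreates the compat structures so quota.c can handle both ABIs inline. The whole problem exists because i386 aligns u64 to 4 bytes while x86-64 aligns it to 8, so a layout of eight u64s followed by one u32 — the shape of struct if_dqblk — is 68 bytes under the 32-bit ABI but 72 bytes (with tail padding) under the 64-bit one. A standalone check that makes the difference visible, using GCC/Clang's aligned attribute to emulate the i386 ABI on any host; the struct and field names are stand-ins:

    #include <stdint.h>
    #include <stdio.h>

    /* Emulate the i386 ABI: 64-bit integers aligned to 4 bytes. */
    typedef uint64_t compat_u64 __attribute__((aligned(4)));

    struct native_dqblk {                  /* 64-bit native layout */
        uint64_t limits_and_counts[8];
        uint32_t valid;                    /* + 4 bytes tail padding */
    };

    struct compat_dqblk {                  /* 32-bit (compat) layout */
        compat_u64 limits_and_counts[8];
        uint32_t valid;                    /* no tail padding */
    };

    int main(void)
    {
        /* Typically prints "native: 72, compat: 68" on an LP64 host. */
        printf("native: %zu, compat: %zu\n",
               sizeof(struct native_dqblk), sizeof(struct compat_dqblk));
        return 0;
    }
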
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 47f9e151988b..9af95c7a0bbe 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -19,6 +19,7 @@
#include <linux/types.h>
#include <linux/writeback.h>
#include <linux/nospec.h>
+#include "compat.h"
static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
qid_t id)
@@ -211,8 +212,18 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
if (ret)
return ret;
copy_to_if_dqblk(&idq, &fdq);
- if (copy_to_user(addr, &idq, sizeof(idq)))
- return -EFAULT;
+
+ if (compat_need_64bit_alignment_fixup()) {
+ struct compat_if_dqblk __user *compat_dqblk = addr;
+
+ if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk)))
+ return -EFAULT;
+ if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(addr, &idq, sizeof(idq)))
+ return -EFAULT;
+ }
return 0;
}
@@ -277,8 +288,16 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
struct if_dqblk idq;
struct kqid qid;
- if (copy_from_user(&idq, addr, sizeof(idq)))
- return -EFAULT;
+ if (compat_need_64bit_alignment_fixup()) {
+ struct compat_if_dqblk __user *compat_dqblk = addr;
+
+ if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) ||
+ get_user(idq.dqb_valid, &compat_dqblk->dqb_valid))
+ return -EFAULT;
+ } else {
+ if (copy_from_user(&idq, addr, sizeof(idq)))
+ return -EFAULT;
+ }
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
@@ -382,6 +401,33 @@ static int quota_getstate(struct super_block *sb, int type,
return 0;
}
+static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to,
+ struct fs_qfilestat *from)
+{
+ if (copy_to_user(to, from, sizeof(*to)) ||
+ put_user(from->qfs_nextents, &to->qfs_nextents))
+ return -EFAULT;
+ return 0;
+}
+
+static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to,
+ struct fs_quota_stat *from)
+{
+ if (put_user(from->qs_version, &to->qs_version) ||
+ put_user(from->qs_flags, &to->qs_flags) ||
+ put_user(from->qs_pad, &to->qs_pad) ||
+ compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) ||
+ compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) ||
+ put_user(from->qs_incoredqs, &to->qs_incoredqs) ||
+ put_user(from->qs_btimelimit, &to->qs_btimelimit) ||
+ put_user(from->qs_itimelimit, &to->qs_itimelimit) ||
+ put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) ||
+ put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) ||
+ put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit))
+ return -EFAULT;
+ return 0;
+}
+
static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
{
struct fs_quota_stat fqs;
@@ -390,9 +436,14 @@ static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
if (!sb->s_qcop->get_state)
return -ENOSYS;
ret = quota_getstate(sb, type, &fqs);
- if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+ if (ret)
+ return ret;
+
+ if (compat_need_64bit_alignment_fixup())
+ return compat_copy_fs_quota_stat(addr, &fqs);
+ if (copy_to_user(addr, &fqs, sizeof(fqs)))
return -EFAULT;
- return ret;
+ return 0;
}
static int quota_getstatev(struct super_block *sb, int type,
@@ -481,6 +532,14 @@ static inline u64 quota_btobb(u64 bytes)
return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT;
}
+static inline s64 copy_from_xfs_dqblk_ts(const struct fs_disk_quota *d,
+ __s32 timer, __s8 timer_hi)
+{
+ if (d->d_fieldmask & FS_DQ_BIGTIME)
+ return (u32)timer | (s64)timer_hi << 32;
+ return timer;
+}
+
static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
{
dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit);
@@ -489,14 +548,17 @@ static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
dst->d_ino_softlimit = src->d_ino_softlimit;
dst->d_space = quota_bbtob(src->d_bcount);
dst->d_ino_count = src->d_icount;
- dst->d_ino_timer = src->d_itimer;
- dst->d_spc_timer = src->d_btimer;
+ dst->d_ino_timer = copy_from_xfs_dqblk_ts(src, src->d_itimer,
+ src->d_itimer_hi);
+ dst->d_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_btimer,
+ src->d_btimer_hi);
dst->d_ino_warns = src->d_iwarns;
dst->d_spc_warns = src->d_bwarns;
dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit);
dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit);
dst->d_rt_space = quota_bbtob(src->d_rtbcount);
- dst->d_rt_spc_timer = src->d_rtbtimer;
+ dst->d_rt_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_rtbtimer,
+ src->d_rtbtimer_hi);
dst->d_rt_spc_warns = src->d_rtbwarns;
dst->d_fieldmask = 0;
if (src->d_fieldmask & FS_DQ_ISOFT)
@@ -588,10 +650,26 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
return sb->s_qcop->set_dqblk(sb, qid, &qdq);
}
+static inline void copy_to_xfs_dqblk_ts(const struct fs_disk_quota *d,
+ __s32 *timer_lo, __s8 *timer_hi, s64 timer)
+{
+ *timer_lo = timer;
+ if (d->d_fieldmask & FS_DQ_BIGTIME)
+ *timer_hi = timer >> 32;
+}
+
+static inline bool want_bigtime(s64 timer)
+{
+ return timer > S32_MAX || timer < S32_MIN;
+}
+
static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
int type, qid_t id)
{
memset(dst, 0, sizeof(*dst));
+ if (want_bigtime(src->d_ino_timer) || want_bigtime(src->d_spc_timer) ||
+ want_bigtime(src->d_rt_spc_timer))
+ dst->d_fieldmask |= FS_DQ_BIGTIME;
dst->d_version = FS_DQUOT_VERSION;
dst->d_id = id;
if (type == USRQUOTA)
@@ -606,14 +684,17 @@ static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
dst->d_ino_softlimit = src->d_ino_softlimit;
dst->d_bcount = quota_btobb(src->d_space);
dst->d_icount = src->d_ino_count;
- dst->d_itimer = src->d_ino_timer;
- dst->d_btimer = src->d_spc_timer;
+ copy_to_xfs_dqblk_ts(dst, &dst->d_itimer, &dst->d_itimer_hi,
+ src->d_ino_timer);
+ copy_to_xfs_dqblk_ts(dst, &dst->d_btimer, &dst->d_btimer_hi,
+ src->d_spc_timer);
dst->d_iwarns = src->d_ino_warns;
dst->d_bwarns = src->d_spc_warns;
dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit);
dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit);
dst->d_rtbcount = quota_btobb(src->d_rt_space);
- dst->d_rtbtimer = src->d_rt_spc_timer;
+ copy_to_xfs_dqblk_ts(dst, &dst->d_rtbtimer, &dst->d_rtbtimer_hi,
+ src->d_rt_spc_timer);
dst->d_rtbwarns = src->d_rt_spc_warns;
}
@@ -816,8 +897,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
* calls. Maybe we need to add the process quotas etc. in the future,
* but we probably should use rlimits for that.
*/
-int kernel_quotactl(unsigned int cmd, const char __user *special,
- qid_t id, void __user *addr)
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+ qid_t, id, void __user *, addr)
{
uint cmds, type;
struct super_block *sb = NULL;
@@ -871,9 +952,3 @@ out:
path_put(pathp);
return ret;
}
-
-SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
- qid_t, id, void __user *, addr)
-{
- return kernel_quotactl(cmd, special, id, addr);
-}
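
The *_ts helpers above pack a 64-bit quota grace timestamp into the legacy 32-bit on-disk field plus a repurposed 8-bit pad byte, extending the range to roughly 2^39 seconds when FS_DQ_BIGTIME is set. A self-contained round-trip of that packing, with hypothetical local names in place of the fs_disk_quota fields:

    #include <assert.h>
    #include <stdint.h>

    /* Pack: low 32 bits in the legacy field, next 8 in the old pad byte. */
    static void pack_ts(int64_t t, int32_t *lo, int8_t *hi)
    {
        *lo = (int32_t)t;
        *hi = (int8_t)(t >> 32);
    }

    /* Unpack: low half reassembled unsigned so bit 31 isn't sign-extended,
     * exactly like the (u32)timer | (s64)timer_hi << 32 expression above. */
    static int64_t unpack_ts(int32_t lo, int8_t hi)
    {
        return (uint32_t)lo | (int64_t)hi << 32;
    }

    int main(void)
    {
        /* A post-2038 timestamp that overflows a signed 32-bit field. */
        int64_t t = (int64_t)1 << 33;
        int32_t lo;
        int8_t hi;

        pack_ts(t, &lo, &hi);
        assert(unpack_ts(lo, hi) == t);

        /* Values with bit 31 set still round-trip correctly. */
        pack_ts(0x1FFFFFFFFLL, &lo, &hi);
        assert(unpack_ts(lo, hi) == 0x1FFFFFFFFLL);
        return 0;
    }
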
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 58fc2a7c7fd1..e69a2bfdd81c 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -282,6 +282,7 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
d->dqb_btime = cpu_to_le64(m->dqb_btime);
d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
+ d->dqb_pad = 0;
if (qtree_entry_unused(info, dp))
d->dqb_itime = cpu_to_le64(1);
}
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 414695454956..355523f4a4bf 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -224,7 +224,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
if (!pages)
goto out_free;
- nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
+ nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages);
if (nr != lpages)
goto out_free_pages; /* leave if some pages were missing */
diff --git a/fs/read_write.c b/fs/read_write.c
index 5db58b8c78d0..19f5c4bf75aa 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -538,6 +538,14 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
inc_syscw(current);
return ret;
}
+/*
+ * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
+ * but autofs is one of the few internal kernel users that actually
+ * wants this _and_ can be built as a module. So we need to export
+ * this symbol for autofs, even though it really isn't appropriate
+ * for any other kernel modules.
+ */
+EXPORT_SYMBOL_GPL(__kernel_write);
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
loff_t *pos)
@@ -752,185 +760,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return ret;
}
-/**
- * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
- * into the kernel and check that it is valid.
- *
- * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
- * @uvector: Pointer to the userspace array.
- * @nr_segs: Number of elements in userspace array.
- * @fast_segs: Number of elements in @fast_pointer.
- * @fast_pointer: Pointer to (usually small on-stack) kernel array.
- * @ret_pointer: (output parameter) Pointer to a variable that will point to
- * either @fast_pointer, a newly allocated kernel array, or NULL,
- * depending on which array was used.
- *
- * This function copies an array of &struct iovec of @nr_segs from
- * userspace into the kernel and checks that each element is valid (e.g.
- * it does not point to a kernel address or cause overflow by being too
- * large, etc.).
- *
- * As an optimization, the caller may provide a pointer to a small
- * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
- * (the size of this array, or 0 if unused, should be given in @fast_segs).
- *
- * @ret_pointer will always point to the array that was used, so the
- * caller must take care not to call kfree() on it e.g. in case the
- * @fast_pointer array was used and it was allocated on the stack.
- *
- * Return: The total number of bytes covered by the iovec array on success
- * or a negative error code on error.
- */
-ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
- unsigned long nr_segs, unsigned long fast_segs,
- struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- unsigned long seg;
- ssize_t ret;
- struct iovec *iov = fast_pointer;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0) {
- ret = 0;
- goto out;
- }
-
- /*
- * First get the "struct iovec" from user memory and
- * verify all the pointers
- */
- if (nr_segs > UIO_MAXIOV) {
- ret = -EINVAL;
- goto out;
- }
- if (nr_segs > fast_segs) {
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- }
- if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * According to the Single Unix Specification we should return EINVAL
- * if an element length is < 0 when cast to ssize_t or if the
- * total length would overflow the ssize_t return value of the
- * system call.
- *
- * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
- * overflow case.
- */
- ret = 0;
- for (seg = 0; seg < nr_segs; seg++) {
- void __user *buf = iov[seg].iov_base;
- ssize_t len = (ssize_t)iov[seg].iov_len;
-
- /* see if we we're about to use an invalid len or if
- * it's about to overflow ssize_t */
- if (len < 0) {
- ret = -EINVAL;
- goto out;
- }
- if (type >= 0
- && unlikely(!access_ok(buf, len))) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - ret) {
- len = MAX_RW_COUNT - ret;
- iov[seg].iov_len = len;
- }
- ret += len;
- }
-out:
- *ret_pointer = iov;
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-ssize_t compat_rw_copy_check_uvector(int type,
- const struct compat_iovec __user *uvector, unsigned long nr_segs,
- unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- compat_ssize_t tot_len;
- struct iovec *iov = *ret_pointer = fast_pointer;
- ssize_t ret = 0;
- int seg;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0)
- goto out;
-
- ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV)
- goto out;
- if (nr_segs > fast_segs) {
- ret = -ENOMEM;
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL)
- goto out;
- }
- *ret_pointer = iov;
-
- ret = -EFAULT;
- if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t.
- *
- * In Linux, the total length is limited to MAX_RW_COUNT, there is
- * no overflow possibility.
- */
- tot_len = 0;
- ret = -EINVAL;
- for (seg = 0; seg < nr_segs; seg++) {
- compat_uptr_t buf;
- compat_ssize_t len;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting in compat_ssize_t .. */
- goto out;
- if (type >= 0 &&
- !access_ok(compat_ptr(buf), len)) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - tot_len)
- len = MAX_RW_COUNT - tot_len;
- tot_len += len;
- iov->iov_base = compat_ptr(buf);
- iov->iov_len = (compat_size_t) len;
- uvector++;
- iov++;
- }
- ret = tot_len;
-
-out:
- return ret;
-}
-#endif
-
static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
loff_t *pos, rwf_t flags)
{
@@ -1247,224 +1076,93 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
return do_pwritev(fd, vec, vlen, pos, flags);
}
+/*
+ * Various compat syscalls. Note that they all pretend to take a native
+ * iovec - import_iovec will properly treat those as compat_iovecs based on
+ * in_compat_syscall().
+ */
#ifdef CONFIG_COMPAT
-static size_t compat_readv(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- ret = do_iter_read(file, &iter, pos, flags);
- kfree(iov);
- }
- if (ret > 0)
- add_rchar(current, ret);
- inc_syscr(current);
- return ret;
-}
-
-static size_t do_compat_readv(compat_ulong_t fd,
- const struct compat_iovec __user *vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-
-}
-
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
- compat_ulong_t, vlen)
-{
- return do_compat_readv(fd, vec, vlen, 0);
-}
-
-static long do_compat_preadv64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
-}
-
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
-}
-
-static size_t compat_writev(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- file_start_write(file);
- ret = do_iter_write(file, &iter, pos, flags);
- file_end_write(file);
- kfree(iov);
- }
- if (ret > 0)
- add_wchar(current, ret);
- inc_syscw(current);
- return ret;
-}
-
-static size_t do_compat_writev(compat_ulong_t fd,
- const struct compat_iovec __user* vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
- const struct compat_iovec __user *, vec,
- compat_ulong_t, vlen)
-{
- return do_compat_writev(fd, vec, vlen, 0);
-}
-
-static long do_compat_pwritev64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
-
-#endif
+#endif /* CONFIG_COMPAT */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
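
The consolidation above works because import_iovec() now picks the 32-bit or 64-bit struct iovec layout at copy-in time, keyed off in_compat_syscall(), so one set of do_preadv()/do_pwritev() helpers serves both ABIs. A userspace sketch of the single-entry-point idea — one importer normalizing two wire layouts into the native one; this is not the kernel's implementation and the names are illustrative:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>

    struct compat_iovec {          /* the 32-bit ABI's wire layout */
        uint32_t iov_base;
        uint32_t iov_len;
    };

    /*
     * Normalize either layout into native struct iovec, the way the
     * reworked import_iovec() does internally based on the equivalent
     * of in_compat_syscall(). Returns a malloc'd array or NULL.
     */
    static struct iovec *import_iov(const void *uvec, unsigned long n,
                                    bool compat)
    {
        struct iovec *iov = calloc(n, sizeof(*iov));

        if (!iov)
            return NULL;
        if (compat) {
            const struct compat_iovec *c = uvec;

            for (unsigned long i = 0; i < n; i++) {
                iov[i].iov_base = (void *)(uintptr_t)c[i].iov_base;
                iov[i].iov_len = c[i].iov_len;
            }
        } else {
            memcpy(iov, uvec, n * sizeof(*iov));
        }
        return iov;
    }

    int main(void)
    {
        /* Fake user addresses; never dereferenced here. */
        struct compat_iovec cv[2] = { { 0x1000, 16 }, { 0x2000, 32 } };
        struct iovec *iov = import_iov(cv, 2, true);

        if (iov) {
            printf("iov[1].iov_len = %zu\n", iov[1].iov_len);
            free(iov);
        }
        return 0;
    }
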
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1509775da040..c76d563dec0e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1551,11 +1551,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
* set version 1, version 2 could be used too, because stat data
* key is the same in both versions
*/
- key.version = KEY_FORMAT_3_5;
- key.on_disk_key.k_dir_id = dirino;
- key.on_disk_key.k_objectid = inode->i_ino;
- key.on_disk_key.k_offset = 0;
- key.on_disk_key.k_type = 0;
+ _make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3);
/* look for the object's stat data */
retval = search_item(inode->i_sb, &key, &path_to_sd);
@@ -2163,7 +2159,8 @@ out_end_trans:
out_inserted_sd:
clear_nlink(inode);
th->t_trans_id = 0; /* so the caller can't use this handle later */
- unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
iput(inode);
return err;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index a6bce5b1fb1d..1b9c7a387dc7 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1258,6 +1258,10 @@ static int reiserfs_parse_options(struct super_block *s,
"turned on.");
return 0;
}
+ if (qf_names[qtype] !=
+ REISERFS_SB(s)->s_qf_names[qtype])
+ kfree(qf_names[qtype]);
+ qf_names[qtype] = NULL;
if (*arg) { /* Some filename specified? */
if (REISERFS_SB(s)->s_qf_names[qtype]
&& strcmp(REISERFS_SB(s)->s_qf_names[qtype],
@@ -1287,10 +1291,6 @@ static int reiserfs_parse_options(struct super_block *s,
else
*mount_options |= 1 << REISERFS_GRPQUOTA;
} else {
- if (qf_names[qtype] !=
- REISERFS_SB(s)->s_qf_names[qtype])
- kfree(qf_names[qtype]);
- qf_names[qtype] = NULL;
if (qtype == USRQUOTA)
*mount_options &= ~(1 << REISERFS_USRQUOTA);
else
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 28b241cd6987..fe63a7c3e0da 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -674,6 +674,13 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
if (get_inode_sd_version(inode) == STAT_DATA_V1)
return -EOPNOTSUPP;
+ /*
+ * priv_root needn't be initialized during mount so allow initial
+ * lookups to succeed.
+ */
+ if (!REISERFS_SB(inode->i_sb)->priv_root)
+ return 0;
+
dentry = xattr_lookup(inode, name, XATTR_REPLACE);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e582d001f792..b1b7d3f5752f 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -356,6 +356,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
}
i->i_mode = mode;
+ i->i_blocks = (i->i_size + 511) >> 9;
unlock_new_inode(i);
return i;
diff --git a/fs/splice.c b/fs/splice.c
index d7c8a7c4db07..70cc52af780b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,7 +33,6 @@
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
-#include <linux/compat.h>
#include <linux/sched/signal.h>
#include "internal.h"
@@ -526,6 +525,22 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
return 1;
}
+/* We know we have a pipe buffer, but maybe it's empty? */
+static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
+{
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+
+ if (unlikely(!buf->len)) {
+ pipe_buf_release(pipe, buf);
+ pipe->tail = tail+1;
+ return true;
+ }
+
+ return false;
+}
+
/**
* splice_from_pipe_next - wait for some data to splice from
* @pipe: pipe to splice from
@@ -545,6 +560,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
if (signal_pending(current))
return -ERESTARTSYS;
+repeat:
while (pipe_empty(pipe->head, pipe->tail)) {
if (!pipe->writers)
return 0;
@@ -563,9 +579,12 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
sd->need_wakeup = false;
}
- pipe_wait(pipe);
+ pipe_wait_readable(pipe);
}
+ if (eat_empty_buffer(pipe))
+ goto repeat;
+
return 1;
}
@@ -1077,7 +1096,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
return -EAGAIN;
if (signal_pending(current))
return -ERESTARTSYS;
- pipe_wait(pipe);
+ pipe_wait_writable(pipe);
}
}
@@ -1332,20 +1351,6 @@ static int vmsplice_type(struct fd f, int *type)
* Currently we punt and implement it as a normal copy, see pipe_to_user().
*
*/
-static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
-{
- if (unlikely(flags & ~SPLICE_F_ALL))
- return -EINVAL;
-
- if (!iov_iter_count(iter))
- return 0;
-
- if (iov_iter_rw(iter) == WRITE)
- return vmsplice_to_pipe(f, iter, flags);
- else
- return vmsplice_to_user(f, iter, flags);
-}
-
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
unsigned long, nr_segs, unsigned int, flags)
{
@@ -1356,6 +1361,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
struct fd f;
int type;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
f = fdget(fd);
error = vmsplice_type(f, &type);
if (error)
@@ -1363,40 +1371,21 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
error = import_iovec(type, uiov, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
- if (error >= 0) {
- error = do_vmsplice(f.file, &iter, flags);
- kfree(iov);
- }
- fdput(f);
- return error;
-}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
- unsigned int, nr_segs, unsigned int, flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t error;
- struct fd f;
- int type;
+ if (error < 0)
+ goto out_fdput;
- f = fdget(fd);
- error = vmsplice_type(f, &type);
- if (error)
- return error;
+ if (!iov_iter_count(&iter))
+ error = 0;
+ else if (iov_iter_rw(&iter) == WRITE)
+ error = vmsplice_to_pipe(f.file, &iter, flags);
+ else
+ error = vmsplice_to_user(f.file, &iter, flags);
- error = compat_import_iovec(type, iov32, nr_segs,
- ARRAY_SIZE(iovstack), &iov, &iter);
- if (error >= 0) {
- error = do_vmsplice(f.file, &iter, flags);
- kfree(iov);
- }
+ kfree(iov);
+out_fdput:
fdput(f);
return error;
}
-#endif
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
@@ -1454,7 +1443,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
ret = -EAGAIN;
break;
}
- pipe_wait(pipe);
+ pipe_wait_readable(pipe);
}
pipe_unlock(pipe);
@@ -1493,7 +1482,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
ret = -ERESTARTSYS;
break;
}
- pipe_wait(pipe);
+ pipe_wait_writable(pipe);
}
pipe_unlock(pipe);
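
eat_empty_buffer() above relies on the pipe ring being a power-of-two array indexed by free-running head/tail counters, so tail & (ring_size - 1) selects the slot and the counters never need explicit wrapping. A toy ring with the same indexing scheme and the same skip-empty step the splice loop gained; a sketch, not the pipe code:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define RING_SIZE 8              /* must be a power of two */

    struct buf { size_t len; };

    struct ring {
        struct buf bufs[RING_SIZE];
        unsigned int head, tail;     /* free-running; empty when equal */
    };

    static bool ring_empty(const struct ring *r)
    {
        return r->head == r->tail;
    }

    /*
     * Like eat_empty_buffer(): if the slot at the tail holds a
     * zero-length buffer, consume it and tell the caller to re-check
     * for emptiness — the "goto repeat" in splice_from_pipe_next().
     */
    static bool eat_empty(struct ring *r)
    {
        unsigned int mask = RING_SIZE - 1;
        struct buf *b = &r->bufs[r->tail & mask];

        if (!ring_empty(r) && b->len == 0) {
            r->tail++;
            return true;
        }
        return false;
    }

    int main(void)
    {
        struct ring r = { .head = 2, .tail = 0 };

        r.bufs[0].len = 0;           /* stale zero-length slot at tail */
        r.bufs[1].len = 42;
        while (eat_empty(&r))
            ;                        /* re-check, as the goto does */
        printf("tail now %u, len %zu\n", r.tail, r.bufs[r.tail].len);
        return 0;
    }
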
diff --git a/fs/super.c b/fs/super.c
index 904459b35119..a51c2083cd6b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1256,6 +1256,8 @@ static int set_bdev_super(struct super_block *s, void *data)
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+ if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
+ s->s_iflags |= SB_I_STABLE_WRITES;
return 0;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index eb6897ab78e7..96d0da65e088 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
+#include <linux/mm.h>
#include "sysfs.h"
@@ -707,3 +708,57 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
return 0;
}
EXPORT_SYMBOL_GPL(sysfs_change_owner);
+
+/**
+ * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer.
+ * @buf: start of PAGE_SIZE buffer.
+ * @fmt: format
+ * @...: optional arguments to @format
+ *
+ *
+ * Returns number of characters written to @buf.
+ */
+int sysfs_emit(char *buf, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ if (WARN(!buf || offset_in_page(buf),
+ "invalid sysfs_emit: buf:%p\n", buf))
+ return 0;
+
+ va_start(args, fmt);
+ len = vscnprintf(buf, PAGE_SIZE, fmt, args);
+ va_end(args);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(sysfs_emit);
+
+/**
+ * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer.
+ * @buf: start of PAGE_SIZE buffer.
+ * @at: offset in @buf to start write in bytes;
+ *      must be >= 0 && < PAGE_SIZE
+ * @fmt: format
+ * @...: optional arguments to @fmt
+ *
+ * Returns number of characters written starting at &@buf[@at].
+ */
+int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE,
+ "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at))
+ return 0;
+
+ va_start(args, fmt);
+ len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args);
+ va_end(args);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(sysfs_emit_at);
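
sysfs_emit() and sysfs_emit_at() above exist so attribute show() methods cannot scribble past the single PAGE_SIZE buffer sysfs hands them. A userspace rendition of the same contract — a fixed 4096-byte "page", offset validation, truncating vsnprintf — to make the bounds behavior concrete; this is a sketch, not the kernel implementation:

    #include <stdarg.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096

    /*
     * Append into a PAGE_SIZE buffer at offset 'at', refusing invalid
     * offsets outright, the way sysfs_emit_at() does with WARN().
     */
    static int emit_at(char *buf, int at, const char *fmt, ...)
    {
        va_list args;
        int len;

        if (!buf || at < 0 || at >= PAGE_SIZE)
            return 0;

        va_start(args, fmt);
        len = vsnprintf(buf + at, PAGE_SIZE - at, fmt, args);
        va_end(args);

        /* vsnprintf may report more than fit; clamp like scnprintf. */
        if (len >= PAGE_SIZE - at)
            len = PAGE_SIZE - at - 1;
        return len;
    }

    int main(void)
    {
        char page[PAGE_SIZE];
        int n = emit_at(page, 0, "%d\n", 42);

        n += emit_at(page, n, "%s\n", "second line");
        fwrite(page, 1, n, stdout);
        return 0;
    }
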
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index cc5c0abfd536..b93b3cd10bfd 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -54,7 +54,7 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash,
* ubifs_prepare_auth_node - Prepare an authentication node
* @c: UBIFS file-system description object
* @node: the node to calculate a hash for
- * @hash: input hash of previous nodes
+ * @inhash: input hash of previous nodes
*
* This function prepares an authentication node for writing onto flash.
* It creates a HMAC from the given input hash and writes it to the node.
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 31288d8fa2ce..ebff43f8009c 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1123,6 +1123,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
err = PTR_ERR(dent);
if (err == -ENOENT)
break;
+ kfree(pdent);
return err;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9d042942d8b2..155521e51ac5 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -81,19 +81,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
struct ubifs_inode *ui;
bool encrypted = false;
- if (IS_ENCRYPTED(dir)) {
- err = fscrypt_get_encryption_info(dir);
- if (err) {
- ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err);
- return ERR_PTR(err);
- }
-
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-EPERM);
-
- encrypted = true;
- }
-
inode = new_inode(c->vfs_sb);
ui = ubifs_inode(inode);
if (!inode)
@@ -112,6 +99,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
current_time(inode);
inode->i_mapping->nrpages = 0;
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+ if (err) {
+ ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
+ goto out_iput;
+ }
+
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -131,7 +124,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
case S_IFBLK:
case S_IFCHR:
inode->i_op = &ubifs_file_inode_operations;
- encrypted = false;
break;
default:
BUG();
@@ -151,9 +143,8 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
if (c->highest_inum >= INUM_WATERMARK) {
spin_unlock(&c->cnt_lock);
ubifs_err(c, "out of inode numbers");
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(-EINVAL);
+ err = -EINVAL;
+ goto out_iput;
}
ubifs_warn(c, "running out of inode numbers (current %lu, max %u)",
(unsigned long)c->highest_inum, INUM_WATERMARK);
@@ -171,16 +162,19 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
spin_unlock(&c->cnt_lock);
if (encrypted) {
- err = fscrypt_inherit_context(dir, inode, &encrypted, true);
+ err = fscrypt_set_context(inode, NULL);
if (err) {
- ubifs_err(c, "fscrypt_inherit_context failed: %i", err);
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(err);
+ ubifs_err(c, "fscrypt_set_context failed: %i", err);
+ goto out_iput;
}
}
return inode;
+
+out_iput:
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
}
static int dbg_check_name(const struct ubifs_info *c,
@@ -515,7 +509,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
if (err)
return err;
- err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(UBIFS_MAX_NLEN, &fstr);
if (err)
return err;
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 62cb3db44e6e..a4aaeea63893 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -57,10 +57,6 @@
/**
* switch_gc_head - switch the garbage collection journal head.
* @c: UBIFS file-system description object
- * @buf: buffer to write
- * @len: length of the buffer to write
- * @lnum: LEB number written is returned here
- * @offs: offset written is returned here
*
 * This function switches the GC head to the next LEB which is reserved in
* @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 3df9be2c684c..4363d85a3fd4 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -134,7 +134,6 @@ static int setflags(struct inode *inode, int flags)
return err;
out_unlock:
- ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino);
mutex_unlock(&ui->ui_mutex);
ubifs_release_budget(c, &req);
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 4a5b06f8d812..091c2ad8f211 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -894,6 +894,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
if (err == -ENOENT)
break;
+ kfree(pxent);
goto out_release;
}
@@ -906,6 +907,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
ubifs_err(c, "dead directory entry '%s', error %d",
xent->name, err);
ubifs_ro_mode(c, err);
+ kfree(pxent);
kfree(xent);
goto out_release;
}
@@ -936,8 +938,6 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
inode->i_ino);
release_head(c, BASEHD);
- ubifs_add_auth_dirt(c, lnum);
-
if (last_reference) {
err = ubifs_tnc_remove_ino(c, inode->i_ino);
if (err)
@@ -947,6 +947,8 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
} else {
union ubifs_key key;
+ ubifs_add_auth_dirt(c, lnum);
+
ino_key_init(c, &key, inode->i_ino);
err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash);
}
@@ -1798,7 +1800,6 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
u8 hash[UBIFS_HASH_ARR_SZ];
dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
- ubifs_assert(c, host->i_nlink > 0);
ubifs_assert(c, inode->i_nlink > 0);
ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex));
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 2c294085ffed..0fb61956146d 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -173,6 +173,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
err = PTR_ERR(xent);
if (err == -ENOENT)
break;
+ kfree(pxent);
return err;
}
@@ -182,6 +183,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
xattr_orphan = orphan_add(c, xattr_inum, orphan);
if (IS_ERR(xattr_orphan)) {
+ kfree(pxent);
kfree(xent);
return PTR_ERR(xattr_orphan);
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index b69ffac7e415..2f8d8f4f411a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -931,8 +931,6 @@ out:
* validate_ref - validate a reference node.
* @c: UBIFS file-system description object
* @ref: the reference node to validate
- * @ref_lnum: LEB number of the reference node
- * @ref_offs: reference node offset
*
* This function returns %1 if a bud reference already exists for the LEB. %0 is
* returned if the reference node is new, otherwise %-EINVAL is returned if
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a2420c900275..cb3acfb7dd1f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1110,14 +1110,20 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
break;
}
case Opt_auth_key:
- c->auth_key_name = kstrdup(args[0].from, GFP_KERNEL);
- if (!c->auth_key_name)
- return -ENOMEM;
+ if (!is_remount) {
+ c->auth_key_name = kstrdup(args[0].from,
+ GFP_KERNEL);
+ if (!c->auth_key_name)
+ return -ENOMEM;
+ }
break;
case Opt_auth_hash_name:
- c->auth_hash_name = kstrdup(args[0].from, GFP_KERNEL);
- if (!c->auth_hash_name)
- return -ENOMEM;
+ if (!is_remount) {
+ c->auth_hash_name = kstrdup(args[0].from,
+ GFP_KERNEL);
+ if (!c->auth_hash_name)
+ return -ENOMEM;
+ }
break;
case Opt_ignore:
break;
@@ -1141,6 +1147,18 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
return 0;
}
+/*
+ * ubifs_release_options - release mount parameters which have been duplicated.
+ * @c: UBIFS file-system description object
+ */
+static void ubifs_release_options(struct ubifs_info *c)
+{
+ kfree(c->auth_key_name);
+ c->auth_key_name = NULL;
+ kfree(c->auth_hash_name);
+ c->auth_hash_name = NULL;
+}
+
/**
* destroy_journal - destroy journal data structures.
* @c: UBIFS file-system description object
@@ -1313,7 +1331,7 @@ static int mount_ubifs(struct ubifs_info *c)
err = ubifs_read_superblock(c);
if (err)
- goto out_free;
+ goto out_auth;
c->probing = 0;
@@ -1325,18 +1343,18 @@ static int mount_ubifs(struct ubifs_info *c)
ubifs_err(c, "'compressor \"%s\" is not compiled in",
ubifs_compr_name(c, c->default_compr));
err = -ENOTSUPP;
- goto out_free;
+ goto out_auth;
}
err = init_constants_sb(c);
if (err)
- goto out_free;
+ goto out_auth;
sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2;
c->cbuf = kmalloc(sz, GFP_NOFS);
if (!c->cbuf) {
err = -ENOMEM;
- goto out_free;
+ goto out_auth;
}
err = alloc_wbufs(c);
@@ -1611,6 +1629,8 @@ out_wbufs:
free_wbufs(c);
out_cbuf:
kfree(c->cbuf);
+out_auth:
+ ubifs_exit_authentication(c);
out_free:
kfree(c->write_reserve_buf);
kfree(c->bu.buf);
@@ -1650,8 +1670,7 @@ static void ubifs_umount(struct ubifs_info *c)
ubifs_lpt_free(c, 0);
ubifs_exit_authentication(c);
- kfree(c->auth_key_name);
- kfree(c->auth_hash_name);
+ ubifs_release_options(c);
kfree(c->cbuf);
kfree(c->rcvrd_mst_node);
kfree(c->mst_node);
@@ -2177,6 +2196,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
c->vi.vol_id);
if (err)
goto out_close;
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
@@ -2219,6 +2240,7 @@ out_umount:
out_unlock:
mutex_unlock(&c->umount_mutex);
out_close:
+ ubifs_release_options(c);
ubi_close_volume(c->ubi);
out:
return err;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f609f6cdde70..894f1ab14616 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -360,7 +360,6 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
/**
* lnc_free - remove a leaf node from the leaf node cache.
* @zbr: zbranch of leaf node
- * @node: leaf node
*/
static void lnc_free(struct ubifs_zbranch *zbr)
{
@@ -2885,6 +2884,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
err = PTR_ERR(xent);
if (err == -ENOENT)
break;
+ kfree(pxent);
return err;
}
@@ -2898,6 +2898,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
fname_len(&nm) = le16_to_cpu(xent->nlen);
err = ubifs_tnc_remove_nm(c, &key1, &nm);
if (err) {
+ kfree(pxent);
kfree(xent);
return err;
}
@@ -2906,6 +2907,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
highest_ino_key(c, &key2, xattr_inum);
err = ubifs_tnc_remove_range(c, &key1, &key2);
if (err) {
+ kfree(pxent);
kfree(xent);
return err;
}
@@ -3466,7 +3468,7 @@ out_unlock:
/**
* dbg_check_inode_size - check if inode size is correct.
* @c: UBIFS file-system description object
- * @inum: inode number
+ * @inode: inode to check
* @size: inode size
*
* This function makes sure that the inode size (@size) is correct and it does
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 9aefbb60074f..a0b9b349efe6 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -522,6 +522,7 @@ int ubifs_purge_xattrs(struct inode *host)
xent->name, err);
ubifs_ro_mode(c, err);
kfree(pxent);
+ kfree(xent);
return err;
}
@@ -531,6 +532,7 @@ int ubifs_purge_xattrs(struct inode *host)
err = remove_xattr(c, host, xino, &nm);
if (err) {
kfree(pxent);
+ kfree(xent);
iput(xino);
ubifs_err(c, "cannot remove xattr, error %d", err);
return err;
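
Nearly all of the ubifs hunks above (journal.c, orphan.c, tnc.c, xattr.c) patch the same leak: the xattr walk keeps the previous entry, pxent, alive as the iteration cursor, and the early error returns were freeing only one of the pxent/xent pair. A distilled, compilable version of the cursor idiom with every exit freeing both; next_entry() and process() are hypothetical stubs standing in for the UBIFS iterator and per-entry work:

    #include <stdlib.h>

    struct entry { int payload; };

    /* Hypothetical iterator: returns a freshly allocated successor of
     * 'prev' (NULL prev means "start"), or NULL at end of walk. */
    static struct entry *next_entry(const struct entry *prev)
    {
        (void)prev;
        return NULL;              /* stub: empty walk */
    }

    static int process(const struct entry *e)
    {
        (void)e;
        return 0;                 /* stub: per-entry work */
    }

    /*
     * The cursor idiom: keep the previous entry alive as the iteration
     * position, and free BOTH pointers on every early exit — which is
     * exactly what the hunks above retrofit.
     */
    static int walk_entries(void)
    {
        struct entry *xent, *pxent = NULL;

        while ((xent = next_entry(pxent)) != NULL) {
            if (process(xent) < 0) {
                free(pxent);      /* the fix: cursor ...   */
                free(xent);       /* ... and current, both */
                return -1;
            }
            free(pxent);          /* previous no longer needed */
            pxent = xent;         /* current becomes cursor    */
        }
        free(pxent);              /* last cursor on clean exit */
        return 0;
    }

    int main(void)
    {
        return walk_entries();
    }
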
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index d9523013096f..73720320f0ab 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -34,7 +34,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
fibh->soffset = fibh->eoffset;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- fi = udf_get_fileident(iinfo->i_ext.i_data -
+ fi = udf_get_fileident(iinfo->i_data -
(iinfo->i_efe ?
sizeof(struct extendedFileEntry) :
sizeof(struct fileEntry)),
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 628941a6b79a..ad8eefad27d7 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -50,7 +50,7 @@ static void __udf_adinicb_readpage(struct page *page)
* So just sample it once and use the same value everywhere.
*/
kaddr = kmap_atomic(page);
- memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, isize);
+ memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize);
memset(kaddr + isize, 0, PAGE_SIZE - isize);
flush_dcache_page(page);
SetPageUptodate(page);
@@ -76,8 +76,7 @@ static int udf_adinicb_writepage(struct page *page,
BUG_ON(!PageLocked(page));
kaddr = kmap_atomic(page);
- memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
- i_size_read(inode));
+ memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, i_size_read(inode));
SetPageUptodate(page);
kunmap_atomic(kaddr);
mark_inode_dirty(inode);
@@ -215,7 +214,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
case UDF_GETEABLOCK:
return copy_to_user((char __user *)arg,
- UDF_I(inode)->i_ext.i_data,
+ UDF_I(inode)->i_data,
UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
default:
return -ENOIOCTLCMD;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 0adb40718a5d..84ed23edebfd 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -67,16 +67,16 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
iinfo->i_efe = 1;
if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
- iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
- sizeof(struct extendedFileEntry),
- GFP_KERNEL);
+ iinfo->i_data = kzalloc(inode->i_sb->s_blocksize -
+ sizeof(struct extendedFileEntry),
+ GFP_KERNEL);
} else {
iinfo->i_efe = 0;
- iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
- sizeof(struct fileEntry),
- GFP_KERNEL);
+ iinfo->i_data = kzalloc(inode->i_sb->s_blocksize -
+ sizeof(struct fileEntry),
+ GFP_KERNEL);
}
- if (!iinfo->i_ext.i_data) {
+ if (!iinfo->i_data) {
iput(inode);
return ERR_PTR(-ENOMEM);
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index adaba8e8b326..bb89c3e43212 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -139,23 +139,26 @@ void udf_evict_inode(struct inode *inode)
struct udf_inode_info *iinfo = UDF_I(inode);
int want_delete = 0;
- if (!inode->i_nlink && !is_bad_inode(inode)) {
- want_delete = 1;
- udf_setsize(inode, 0);
- udf_update_inode(inode, IS_SYNC(inode));
+ if (!is_bad_inode(inode)) {
+ if (!inode->i_nlink) {
+ want_delete = 1;
+ udf_setsize(inode, 0);
+ udf_update_inode(inode, IS_SYNC(inode));
+ }
+ if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
+ inode->i_size != iinfo->i_lenExtents) {
+ udf_warn(inode->i_sb,
+ "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
+ inode->i_ino, inode->i_mode,
+ (unsigned long long)inode->i_size,
+ (unsigned long long)iinfo->i_lenExtents);
+ }
}
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
- inode->i_size != iinfo->i_lenExtents) {
- udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
- inode->i_ino, inode->i_mode,
- (unsigned long long)inode->i_size,
- (unsigned long long)iinfo->i_lenExtents);
- }
- kfree(iinfo->i_ext.i_data);
- iinfo->i_ext.i_data = NULL;
+ kfree(iinfo->i_data);
+ iinfo->i_data = NULL;
udf_clear_extent_cache(inode);
if (want_delete) {
udf_free_inode(inode);
@@ -285,14 +288,14 @@ int udf_expand_file_adinicb(struct inode *inode)
kaddr = kmap_atomic(page);
memset(kaddr + iinfo->i_lenAlloc, 0x00,
PAGE_SIZE - iinfo->i_lenAlloc);
- memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
+ memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr,
iinfo->i_lenAlloc);
flush_dcache_page(page);
SetPageUptodate(page);
kunmap_atomic(kaddr);
}
down_write(&iinfo->i_data_sem);
- memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
+ memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00,
iinfo->i_lenAlloc);
iinfo->i_lenAlloc = 0;
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -308,8 +311,7 @@ int udf_expand_file_adinicb(struct inode *inode)
lock_page(page);
down_write(&iinfo->i_data_sem);
kaddr = kmap_atomic(page);
- memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
- inode->i_size);
+ memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, inode->i_size);
kunmap_atomic(kaddr);
unlock_page(page);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
@@ -396,8 +398,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode,
}
mark_buffer_dirty_inode(dbh, inode);
- memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0,
- iinfo->i_lenAlloc);
+ memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc);
iinfo->i_lenAlloc = 0;
eloc.logicalBlockNum = *block;
eloc.partitionReferenceNum =
@@ -1260,7 +1261,7 @@ set_size:
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
down_write(&iinfo->i_data_sem);
udf_clear_extent_cache(inode);
- memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
+ memset(iinfo->i_data + iinfo->i_lenEAttr + newsize,
0x00, bsize - newsize -
udf_file_entry_alloc_offset(inode));
iinfo->i_lenAlloc = newsize;
@@ -1411,7 +1412,7 @@ reread:
sizeof(struct extendedFileEntry));
if (ret)
goto out;
- memcpy(iinfo->i_ext.i_data,
+ memcpy(iinfo->i_data,
bh->b_data + sizeof(struct extendedFileEntry),
bs - sizeof(struct extendedFileEntry));
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
@@ -1420,7 +1421,7 @@ reread:
ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
if (ret)
goto out;
- memcpy(iinfo->i_ext.i_data,
+ memcpy(iinfo->i_data,
bh->b_data + sizeof(struct fileEntry),
bs - sizeof(struct fileEntry));
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
@@ -1433,7 +1434,7 @@ reread:
sizeof(struct unallocSpaceEntry));
if (ret)
goto out;
- memcpy(iinfo->i_ext.i_data,
+ memcpy(iinfo->i_data,
bh->b_data + sizeof(struct unallocSpaceEntry),
bs - sizeof(struct unallocSpaceEntry));
return 0;
@@ -1614,8 +1615,8 @@ out:
static int udf_alloc_i_data(struct inode *inode, size_t size)
{
struct udf_inode_info *iinfo = UDF_I(inode);
- iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL);
- if (!iinfo->i_ext.i_data)
+ iinfo->i_data = kmalloc(size, GFP_KERNEL);
+ if (!iinfo->i_data)
return -ENOMEM;
return 0;
}
@@ -1706,7 +1707,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
- iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
+ iinfo->i_data, inode->i_sb->s_blocksize -
sizeof(struct unallocSpaceEntry));
use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
crclen = sizeof(struct unallocSpaceEntry);
@@ -1772,7 +1773,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
if (iinfo->i_efe == 0) {
memcpy(bh->b_data + sizeof(struct fileEntry),
- iinfo->i_ext.i_data,
+ iinfo->i_data,
inode->i_sb->s_blocksize - sizeof(struct fileEntry));
fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
@@ -1791,7 +1792,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
crclen = sizeof(struct fileEntry);
} else {
memcpy(bh->b_data + sizeof(struct extendedFileEntry),
- iinfo->i_ext.i_data,
+ iinfo->i_data,
inode->i_sb->s_blocksize -
sizeof(struct extendedFileEntry));
efe->objectSize =
@@ -2087,7 +2088,7 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos,
struct udf_inode_info *iinfo = UDF_I(inode);
if (!epos->bh)
- ptr = iinfo->i_ext.i_data + epos->offset -
+ ptr = iinfo->i_data + epos->offset -
udf_file_entry_alloc_offset(inode) +
iinfo->i_lenEAttr;
else
@@ -2179,7 +2180,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
if (!epos->bh) {
if (!epos->offset)
epos->offset = udf_file_entry_alloc_offset(inode);
- ptr = iinfo->i_ext.i_data + epos->offset -
+ ptr = iinfo->i_data + epos->offset -
udf_file_entry_alloc_offset(inode) +
iinfo->i_lenEAttr;
alen = udf_file_entry_alloc_offset(inode) +
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 401e64cde1be..eab94527340d 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -52,9 +52,9 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
uint16_t crclen;
struct udf_inode_info *iinfo = UDF_I(inode);
- ea = iinfo->i_ext.i_data;
+ ea = iinfo->i_data;
if (iinfo->i_lenEAttr) {
- ad = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
+ ad = iinfo->i_data + iinfo->i_lenEAttr;
} else {
ad = ea;
size += sizeof(struct extendedAttrHeaderDesc);
@@ -153,7 +153,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
uint32_t offset;
struct udf_inode_info *iinfo = UDF_I(inode);
- ea = iinfo->i_ext.i_data;
+ ea = iinfo->i_data;
if (iinfo->i_lenEAttr) {
struct extendedAttrHeaderDesc *eahd;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 77b6d89b9bcd..e169d8fe35b5 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -460,8 +460,7 @@ add:
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
block = dinfo->i_location.logicalBlockNum;
fi = (struct fileIdentDesc *)
- (dinfo->i_ext.i_data +
- fibh->soffset -
+ (dinfo->i_data + fibh->soffset -
udf_ext0_offset(dir) +
dinfo->i_lenEAttr);
} else {
@@ -940,7 +939,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
mark_buffer_dirty_inode(epos.bh, inode);
ea = epos.bh->b_data + udf_ext0_offset(inode);
} else
- ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
+ ea = iinfo->i_data + iinfo->i_lenEAttr;
eoffset = sb->s_blocksize - udf_ext0_offset(inode);
pc = (struct pathComponent *)ea;
@@ -1120,7 +1119,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = -EIO;
if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
dir_fi = udf_get_fileident(
- old_iinfo->i_ext.i_data -
+ old_iinfo->i_data -
(old_iinfo->i_efe ?
sizeof(struct extendedFileEntry) :
sizeof(struct fileEntry)),
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 090baff83990..4cbf40575965 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -65,7 +65,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
}
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data +
+ loc = le32_to_cpu(((__le32 *)(iinfo->i_data +
vdata->s_start_offset))[block]);
goto translate;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1c42f544096d..faf2017ada11 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -168,7 +168,7 @@ static void init_once(void *foo)
{
struct udf_inode_info *ei = (struct udf_inode_info *)foo;
- ei->i_ext.i_data = NULL;
+ ei->i_data = NULL;
inode_init_once(&ei->vfs_inode);
}
@@ -854,7 +854,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
uint8_t *outstr;
struct buffer_head *bh;
uint16_t ident;
- int ret = -ENOMEM;
+ int ret;
struct timestamp *ts;
outstr = kmalloc(128, GFP_NOFS);
@@ -1006,18 +1006,10 @@ int udf_compute_nr_groups(struct super_block *sb, u32 partition)
static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
{
struct udf_bitmap *bitmap;
- int nr_groups;
- int size;
-
- nr_groups = udf_compute_nr_groups(sb, index);
- size = sizeof(struct udf_bitmap) +
- (sizeof(struct buffer_head *) * nr_groups);
-
- if (size <= PAGE_SIZE)
- bitmap = kzalloc(size, GFP_KERNEL);
- else
- bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
+ int nr_groups = udf_compute_nr_groups(sb, index);
+ bitmap = kvzalloc(struct_size(bitmap, s_block_bitmap, nr_groups),
+ GFP_KERNEL);
if (!bitmap)
return NULL;
@@ -1210,7 +1202,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
vat20 = (struct virtualAllocationTable20 *)bh->b_data;
} else {
vat20 = (struct virtualAllocationTable20 *)
- vati->i_ext.i_data;
+ vati->i_data;
}
map->s_type_specific.s_virtual.s_start_offset =
@@ -1353,6 +1345,12 @@ static int udf_load_sparable_map(struct super_block *sb,
(int)spm->numSparingTables);
return -EIO;
}
+ if (le32_to_cpu(spm->sizeSparingTable) > sb->s_blocksize) {
+ udf_err(sb, "error loading logical volume descriptor: "
+ "Too big sparing table size (%u)\n",
+ le32_to_cpu(spm->sizeSparingTable));
+ return -EIO;
+ }
for (i = 0; i < spm->numSparingTables; i++) {
loc = le32_to_cpu(spm->locSparingTable[i]);
@@ -1698,7 +1696,8 @@ static noinline int udf_process_sequence(
"Pointers (max %u supported)\n",
UDF_MAX_TD_NESTING);
brelse(bh);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
vdp = (struct volDescPtr *)bh->b_data;
@@ -1718,7 +1717,8 @@ static noinline int udf_process_sequence(
curr = get_volume_descriptor_record(ident, bh, &data);
if (IS_ERR(curr)) {
brelse(bh);
- return PTR_ERR(curr);
+ ret = PTR_ERR(curr);
+ goto out;
}
/* Descriptor we don't care about? */
if (!curr)
@@ -1740,28 +1740,31 @@ static noinline int udf_process_sequence(
*/
if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) {
udf_err(sb, "Primary Volume Descriptor not found!\n");
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block);
if (ret < 0)
- return ret;
+ goto out;
if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) {
ret = udf_load_logicalvol(sb,
data.vds[VDS_POS_LOGICAL_VOL_DESC].block,
fileset);
if (ret < 0)
- return ret;
+ goto out;
}
/* Now handle prevailing Partition Descriptors */
for (i = 0; i < data.num_part_descs; i++) {
ret = udf_load_partdesc(sb, data.part_descs_loc[i].rec.block);
if (ret < 0)
- return ret;
+ goto out;
}
-
- return 0;
+ ret = 0;
+out:
+ kfree(data.part_descs_loc);
+ return ret;
}
/*
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 25ff91c7e94a..c973db239604 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -122,7 +122,7 @@ static int udf_symlink_filler(struct file *file, struct page *page)
down_read(&iinfo->i_data_sem);
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
+ symlink = iinfo->i_data + iinfo->i_lenEAttr;
} else {
bh = sb_bread(inode->i_sb, pos);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4245d1f63258..06ff7006b822 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -45,11 +45,7 @@ struct udf_inode_info {
unsigned i_strat4096 : 1;
unsigned i_streamdir : 1;
unsigned reserved : 25;
- union {
- struct short_ad *i_sad;
- struct long_ad *i_lad;
- __u8 *i_data;
- } i_ext;
+ __u8 *i_data;
struct kernel_lb_addr i_locStreamdir;
__u64 i_lenStreams;
struct rw_semaphore i_data_sem;
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 2a878b739115..dc25823bfed9 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -6,6 +6,7 @@
#include <linux/parser.h>
#include <linux/errno.h>
#include <linux/unicode.h>
+#include <linux/stringhash.h>
#include "utf8n.h"
@@ -122,9 +123,29 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
}
return -EINVAL;
}
-
EXPORT_SYMBOL(utf8_casefold);
+int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ int c;
+ unsigned long hash = init_name_hash(salt);
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ while ((c = utf8byte(&cur))) {
+ if (c < 0)
+ return -EINVAL;
+ hash = partial_name_hash((unsigned char)c, hash);
+ }
+ str->hash = end_name_hash(hash);
+ return 0;
+}
+EXPORT_SYMBOL(utf8_casefold_hash);
+
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
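A minimal usage sketch (editorial, not part of the patch): a casefolding
filesystem could drive the new helper from its ->d_hash() hook so that names
differing only in case hash to the same dcache bucket. The s_encoding field
and the -EINVAL fallback mirror what ext4/f2fs do and are assumptions here:

	static int example_ci_d_hash(const struct dentry *dentry, struct qstr *str)
	{
		const struct unicode_map *um = dentry->d_sb->s_encoding;

		/* The dentry doubles as the hash salt passed to init_name_hash(). */
		if (utf8_casefold_hash(um, dentry, str) < 0)
			return -EINVAL;	/* name is not valid UTF-8 */
		return 0;
	}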
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0e4a3837da52..000b457ad087 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -601,8 +601,6 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
/* the various vma->vm_userfaultfd_ctx still points to it */
mmap_write_lock(mm);
- /* no task can run (and in turn coredump) yet */
- VM_WARN_ON(!mmget_still_valid(mm));
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -842,7 +840,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
- bool still_valid;
WRITE_ONCE(ctx->released, true);
@@ -858,7 +855,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_lock for writing.
*/
mmap_write_lock(mm);
- still_valid = mmget_still_valid(mm);
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
@@ -869,17 +865,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
- if (still_valid) {
- prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX);
- if (prev)
- vma = prev;
- else
- prev = vma;
- }
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -1309,8 +1303,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- if (!mmget_still_valid(mm))
- goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
@@ -1511,8 +1503,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- if (!mmget_still_valid(mm))
- goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index dd147b490982..4d569f14a8d8 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -134,7 +134,7 @@ try_next_entry:
d_type = vboxsf_get_d_type(info->info.attr.mode);
/*
- * On 32 bit systems pos is 64 signed, while ino is 32 bit
+ * On 32-bit systems pos is 64-bit signed, while ino is 32-bit
* unsigned so fake_ino may overflow, check for this.
*/
if ((ino_t)(ctx->pos + 1) != (u64)(ctx->pos + 1)) {
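A worked instance of that test (editorial illustration; ino_t is 32 bits wide
only on 32-bit kernels):

	/*
	 * Suppose ctx->pos + 1 == 0x100000000:
	 *   (u64)(ctx->pos + 1)   == 0x100000000
	 *   (ino_t)(ctx->pos + 1) == 0x00000000 after 32-bit truncation
	 * The two sides differ, so the wrapped fake inode number is caught
	 * instead of being handed to userspace as inode 0.
	 */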
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 8fe03b4a0d2b..d7816c01a4f6 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -167,6 +167,8 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id);
if (err)
goto fail_free;
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
/* Turn source into a shfl_string and map the folder */
size = strlen(fc->source) + 1;
@@ -384,7 +386,7 @@ fail_nomem:
static int vboxsf_parse_monolithic(struct fs_context *fc, void *data)
{
- char *options = data;
+ unsigned char *options = data;
if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 &&
options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 &&
diff --git a/fs/xattr.c b/fs/xattr.c
index 386b45676d7e..cd7a563e8bcd 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -232,15 +232,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
}
/**
- * __vfs_setxattr_locked: set an extended attribute while holding the inode
+ * __vfs_setxattr_locked - set an extended attribute while holding the inode
* lock
*
- * @dentry - object to perform setxattr on
- * @name - xattr name to set
- * @value - value to set @name to
- * @size - size of @value
- * @flags - flags to pass into filesystem operations
- * @delegated_inode - on return, will contain an inode pointer that
+ * @dentry: object to perform setxattr on
+ * @name: xattr name to set
+ * @value: value to set @name to
+ * @size: size of @value
+ * @flags: flags to pass into filesystem operations
+ * @delegated_inode: on return, will contain an inode pointer that
* a delegation was broken on, NULL if none.
*/
int
@@ -443,12 +443,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
EXPORT_SYMBOL(__vfs_removexattr);
/**
- * __vfs_removexattr_locked: set an extended attribute while holding the inode
+ * __vfs_removexattr_locked - set an extended attribute while holding the inode
* lock
*
- * @dentry - object to perform setxattr on
- * @name - name of xattr to remove
- * @delegated_inode - on return, will contain an inode pointer that
+ * @dentry: object to perform setxattr on
+ * @name: name of xattr to remove
+ * @delegated_inode: on return, will contain an inode pointer that
* a delegation was broken on, NULL if none.
*/
int
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index e685299eb3d2..9fac5ea8d0e4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,31 @@ config XFS_FS
system of your root partition is compiled as a module, you'll need
to use an initial ramdisk (initrd) to boot.
+config XFS_SUPPORT_V4
+ bool "Support deprecated V4 (crc=0) format"
+ depends on XFS_FS
+ default y
+ help
+ The V4 filesystem format lacks certain features that are supported
+ by the V5 format, such as metadata checksumming, strengthened
+ metadata verification, and the ability to store timestamps past the
+ year 2038. Because of this, the V4 format is deprecated. All users
+ should upgrade by backing up their files, reformatting, and restoring
+ from the backup.
+
+ Administrators and users can detect a V4 filesystem by running
+ xfs_info against a filesystem mountpoint and checking for a string
+ beginning with "crc=". If the string "crc=0" is found, the
+ filesystem is a V4 filesystem. If no such string is found, please
+ upgrade xfsprogs to the latest version and try again.
+
+ This option will become default N in September 2025. Support for the
+ V4 format will be removed entirely in September 2030. Distributors
+ can say N here to withdraw support earlier.
+
+ To continue supporting the old V4 format (crc=0), say Y.
+ To close off an attack surface, say N.
+
config XFS_QUOTA
bool "XFS Quota support"
depends on XFS_FS
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index e841ed781a25..e986b95d94c9 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -93,25 +93,3 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags)
return ptr;
return __kmem_vmalloc(size, flags);
}
-
-void *
-kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_realloc(newsize, flags, _RET_IP_);
-
- do {
- ptr = krealloc(old, newsize, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
- current->comm, current->pid,
- newsize, __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- } while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 8e8555817e6d..38007117697e 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -59,7 +59,6 @@ kmem_flags_convert(xfs_km_flags_t flags)
extern void *kmem_alloc(size_t, xfs_km_flags_t);
extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
kvfree(ptr);
@@ -72,12 +71,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
return kmem_alloc(size, flags | KM_ZERO);
}
-static inline void *
-kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
-{
- return kmem_alloc_large(size, flags | KM_ZERO);
-}
-
/*
* Zone interfaces
*/
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 8cf73fe4338e..9331f3516afa 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -333,6 +333,11 @@ xfs_agiblock_init(
}
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ agi->agi_iblocks = cpu_to_be32(1);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ agi->agi_fblocks = cpu_to_be32(1);
+ }
}
typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 2e055c079f39..fd8e6418a0d3 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -428,7 +428,7 @@ xfs_attr_set(
*/
if (XFS_IFORK_Q(dp) == 0) {
int sf_size = sizeof(struct xfs_attr_sf_hdr) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen,
+ xfs_attr_sf_entsize_byname(args->namelen,
args->valuelen);
error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
@@ -523,6 +523,14 @@ out_trans_cancel:
* External routines when attribute list is inside the inode
*========================================================================*/
+static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+{
+ struct xfs_attr_shortform *sf;
+
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ return be16_to_cpu(sf->hdr.totsize);
+}
+
/*
* Add a name to the shortform attribute list structure
* This is the external routine.
@@ -555,8 +563,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
return -ENOSPC;
- newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
- newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ newsize = xfs_attr_sf_totsize(args->dp);
+ newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
if (!forkoff)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 305d4bc07337..bb128db220ac 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -684,9 +684,9 @@ xfs_attr_sf_findname(
sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
end = sf->hdr.count;
- for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+ for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
base += size, i++) {
- size = XFS_ATTR_SF_ENTSIZE(sfe);
+ size = xfs_attr_sf_entsize(sfe);
if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
sfe->flags))
continue;
@@ -728,15 +728,15 @@ xfs_attr_shortform_add(
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
ASSERT(0);
offset = (char *)sfe - (char *)sf;
- size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
- sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
@@ -787,12 +787,12 @@ xfs_attr_shortform_remove(
dp = args->dp;
mp = dp->i_mount;
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
error = xfs_attr_sf_findname(args, &sfe, &base);
if (error != -EEXIST)
return error;
- size = XFS_ATTR_SF_ENTSIZE(sfe);
+ size = xfs_attr_sf_entsize(sfe);
/*
* Fix up the attribute fork data, covering the hole
@@ -837,8 +837,8 @@ xfs_attr_shortform_remove(
int
xfs_attr_shortform_lookup(xfs_da_args_t *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
int i;
struct xfs_ifork *ifp;
@@ -846,10 +846,10 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
ifp = args->dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
sfe->flags))
return -EEXIST;
@@ -873,10 +873,10 @@ xfs_attr_shortform_getvalue(
int i;
ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
sfe->flags))
return xfs_attr_copy_value(args,
@@ -908,12 +908,12 @@ xfs_attr_shortform_to_leaf(
dp = args->dp;
ifp = dp->i_afp;
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, ifp->if_u1.if_data, size);
- sf = (xfs_attr_shortform_t *)tmpbuffer;
+ sf = (struct xfs_attr_shortform *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -951,7 +951,7 @@ xfs_attr_shortform_to_leaf(
ASSERT(error != -ENOSPC);
if (error)
goto out;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
*leaf_bp = bp;
@@ -992,9 +992,8 @@ xfs_attr_shortform_allfit(
return 0;
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return 0;
- bytes += sizeof(struct xfs_attr_sf_entry) - 1
- + name_loc->namelen
- + be16_to_cpu(name_loc->valuelen);
+ bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
}
if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
@@ -1039,7 +1038,5 @@ xfs_attr_shortform_verify(
- * xfs_attr_sf_entry is defined with a 1-byte variable
- * array at the end, so we must subtract that off.
*/
- if (((char *)sfep + sizeof(*sfep) - 1) >= endp)
+ if (((char *)sfep + sizeof(*sfep)) >= endp)
return __this_address;
/* Don't allow names with known bad length. */
@@ -1051,7 +1050,7 @@ xfs_attr_shortform_verify(
* within the data buffer. The next entry starts after the
* name component, so nextentry is an acceptable test.
*/
- next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep);
+ next_sfep = xfs_attr_sf_nextentry(sfep);
if ((char *)next_sfep > endp)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 3f80cede7406..48d8e9caf86f 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -96,8 +96,6 @@ xfs_attr3_rmt_verify(
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return __this_address;
if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index bb004fb7944a..37578b369d9b 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -13,7 +13,6 @@
* to fit into the literal area of the inode.
*/
typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
/*
* We generate this then sort it, attr_list() must return things in hash-order.
@@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort {
unsigned char *name; /* name value, pointer into buffer */
} xfs_attr_sf_sort_t;
-#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
- (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
((1 << (NBBY*(int)sizeof(uint8_t))) - 1)
-#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
- ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
-#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
- ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
-#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
- (be16_to_cpu(((xfs_attr_shortform_t *) \
- ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+/* space name/value uses */
+static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen)
+{
+ return sizeof(struct xfs_attr_sf_entry) + nlen + vlen;
+}
+
+/* space an entry uses */
+static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
+{
+ return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
+}
+
+/* next entry in struct */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_attr_sf_entsize(sfep);
+}
#endif /* __XFS_ATTR_SF_H__ */
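As a usage sketch (editorial; sf is assumed to point at a validated shortform
header, and handle_entry() is a hypothetical consumer), the new inlines
replace the old XFS_ATTR_SF_NEXTENTRY() macro walk:

	struct xfs_attr_sf_entry *sfep = &sf->list[0];
	int i;

	for (i = 0; i < sf->hdr.count; i++) {
		/* nameval[] holds namelen name bytes, then valuelen value bytes */
		handle_entry(sfep->nameval, sfep->namelen, sfep->valuelen);
		sfep = xfs_attr_sf_nextentry(sfep);
	}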
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 1b0a01b06a05..d9a692484eae 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5046,20 +5046,25 @@ xfs_bmap_del_extent_real(
flags = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_fsblock_t bno;
xfs_filblks_t len;
xfs_extlen_t mod;
- bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize,
- &mod);
- ASSERT(mod == 0);
len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
&mod);
ASSERT(mod == 0);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
+ if (!(bflags & XFS_BMAPI_REMAP)) {
+ xfs_fsblock_t bno;
+
+ bno = div_u64_rem(del->br_startblock,
+ mp->m_sb.sb_rextsize, &mod);
+ ASSERT(mod == 0);
+
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ }
+
do_fx = 0;
nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 059ac108b1b3..b876b44c0204 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -15,8 +15,8 @@
*/
#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
-#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
typedef struct xfs_da_blkinfo {
__be32 forw; /* previous block in list */
@@ -35,8 +35,8 @@ typedef struct xfs_da_blkinfo {
*/
#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
-#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */
struct xfs_da3_blkinfo {
/*
@@ -61,7 +61,7 @@ struct xfs_da3_blkinfo {
* Since we have duplicate keys, use a binary search but always follow
* all match in the block, not just the first match found.
*/
-#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
typedef struct xfs_da_node_hdr {
struct xfs_da_blkinfo info; /* block type, links, etc. */
@@ -579,7 +579,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
/*
* Entries are packed toward the top as tight as possible.
*/
-typedef struct xfs_attr_shortform {
+struct xfs_attr_shortform {
struct xfs_attr_sf_hdr { /* constant-structure header block */
__be16 totsize; /* total bytes in shortform list */
__u8 count; /* count of active entries */
@@ -589,9 +589,9 @@ typedef struct xfs_attr_shortform {
uint8_t namelen; /* actual length of name (no NULL) */
uint8_t valuelen; /* actual length of value (no NULL) */
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- uint8_t nameval[1]; /* name & value bytes concatenated */
+ uint8_t nameval[]; /* name & value bytes concatenated */
} list[1]; /* variable sized array */
-} xfs_attr_shortform_t;
+};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
__be16 base; /* base of free region */
@@ -746,14 +746,14 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
+ nlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
+ nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index d8f586256add..eff4a127188e 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -16,6 +16,8 @@
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
/*
* Deferred Operations in XFS
@@ -186,8 +188,9 @@ xfs_defer_create_intent(
{
const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
- dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
- dfp->dfp_count, sort);
+ if (!dfp->dfp_intent)
+ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+ dfp->dfp_count, sort);
}
/*
@@ -312,22 +315,6 @@ xfs_defer_trans_roll(
}
/*
- * Reset an already used dfops after finish.
- */
-static void
-xfs_defer_reset(
- struct xfs_trans *tp)
-{
- ASSERT(list_empty(&tp->t_dfops));
-
- /*
- * Low mode state transfers across transaction rolls to mirror dfops
- * lifetime. Clear it now that dfops is reset.
- */
- tp->t_flags &= ~XFS_TRANS_LOWMODE;
-}
-
-/*
* Free up any items left in the list.
*/
static void
@@ -360,6 +347,58 @@ xfs_defer_cancel_list(
}
/*
+ * Prevent a log intent item from pinning the tail of the log by logging a
+ * done item to release the intent item and then logging a new intent item.
+ * The caller should provide a fresh transaction and roll it after we're done.
+ */
+static int
+xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+{
+ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
+ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
+
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ list_for_each_entry(dfp, dfops, dfp_list) {
+ /*
+ * If the log intent item for this deferred op is not a part of
+ * the current log checkpoint, relog the intent item to keep
+ * the log tail moving forward. We're ok with this being racy
+ * because an incorrect decision means we'll be a little slower
+ * at pushing the tail.
+ */
+ if (dfp->dfp_intent == NULL ||
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
+ /*
+ * Figure out where we need the tail to be in order to maintain
+ * the minimum required free space in the log. Only sample
+ * the log threshold once per call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN) {
+ threshold_lsn = xlog_grant_push_threshold(log, 0);
+ if (threshold_lsn == NULLCOMMITLSN)
+ break;
+ }
+ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+ continue;
+
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
+ }
+
+ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
+ return xfs_defer_trans_roll(tpp);
+ return 0;
+}
+
+/*
* Log an intent-done item for the first pending intent, and finish the work
* items.
*/
@@ -390,6 +429,7 @@ xfs_defer_finish_one(
list_add(li, &dfp->dfp_work);
dfp->dfp_count++;
dfp->dfp_done = NULL;
+ dfp->dfp_intent = NULL;
xfs_defer_create_intent(tp, dfp, false);
}
@@ -428,13 +468,27 @@ xfs_defer_finish_noroll(
/* Until we run out of pending work to finish... */
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
+ /*
+ * Deferred items that are created in the process of finishing
+ * other deferred work items should be queued at the head of
+ * the pending list, which puts them ahead of the deferred work
+ * that was created by the caller. This keeps the number of
+ * pending work items to a minimum, which decreases the amount
+ * of time that any one intent item can stick around in memory,
+ * pinning the log tail.
+ */
xfs_defer_create_intents(*tp);
- list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
error = xfs_defer_trans_roll(tp);
if (error)
goto out_shutdown;
+ /* Possibly relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
error = xfs_defer_finish_one(*tp, dfp);
@@ -475,7 +529,10 @@ xfs_defer_finish(
return error;
}
}
- xfs_defer_reset(*tp);
+
+ /* Reset LOWMODE now that we've finished all the dfops. */
+ ASSERT(list_empty(&(*tp)->t_dfops));
+ (*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
return 0;
}
@@ -549,6 +606,139 @@ xfs_defer_move(
* that behavior.
*/
dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
+ stp->t_flags &= ~XFS_TRANS_LOWMODE;
+}
+
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later. Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off. The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
+ *
+ * If capture_ip is not NULL, the capture structure will obtain an extra
+ * reference to the inode.
+ */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip)
+{
+ struct xfs_defer_capture *dfc;
+
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ /* Create an object to capture the defer ops. */
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ xfs_defer_create_intents(tp);
+
+ /* Move the dfops chain and transaction state to the capture struct. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+ /* Capture the remaining block reservations along with the dfops. */
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
+ /*
+ * Grab an extra reference to this inode and attach it to the capture
+ * structure.
+ */
+ if (capture_ip) {
+ ihold(VFS_I(capture_ip));
+ dfc->dfc_capture_ip = capture_ip;
+ }
+
+ return dfc;
+}
+
+/* Release all resources that we used to capture deferred ops. */
+void
+xfs_defer_ops_release(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+ if (dfc->dfc_capture_ip)
+ xfs_irele(dfc->dfc_capture_ip);
+ kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction. This is the last step
+ * needed to finish a log intent item that we recovered from the log. If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure. The
+ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
+ * xfs_defer_ops_continue.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip,
+ struct list_head *capture_list)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
+ ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
+
+ /* If we don't capture anything, commit transaction and exit. */
+ dfc = xfs_defer_ops_capture(tp, capture_ip);
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+ /* Commit the transaction and add the capture structure to the list. */
+ error = xfs_trans_commit(tp);
+ if (error) {
+ xfs_defer_ops_release(mp, dfc);
+ return error;
+ }
+
+ list_add_tail(&dfc->dfc_list, capture_list);
+ return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure. If an inode was captured, it will be passed back to the
+ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
+ * The caller now owns the inode reference.
+ */
+void
+xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp)
+{
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Lock and join the captured inode to the new transaction. */
+ if (dfc->dfc_capture_ip) {
+ xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
+ }
+ *captured_ipp = dfc->dfc_capture_ip;
+
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
- xfs_defer_reset(stp);
+ kmem_free(dfc);
}
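A sketch of the intended calling pattern for the new capture API (editorial;
resv, error unwinding, and the actual intent-replay step are elided or
assumed -- compare xlog_finish_defer_ops() elsewhere in this series):

	/* While replaying one recovered intent: park the follow-up work. */
	error = xfs_defer_ops_capture_and_commit(tp, ip, &capture_list);

	/* After recovery: resume each parked chain in a fresh transaction. */
	list_for_each_entry_safe(dfc, next, &capture_list, dfc_list) {
		list_del_init(&dfc->dfc_list);
		error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
					dfc->dfc_rtxres, 0, &tp);
		if (error)
			break;
		xfs_defer_ops_continue(dfc, tp, &ip);
		error = xfs_trans_commit(tp);
		/* then xfs_iunlock()/xfs_irele() the returned inode */
	}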
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 6b2ca580f2b0..05472f71fffe 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -8,6 +8,7 @@
struct xfs_btree_cur;
struct xfs_defer_op_type;
+struct xfs_defer_capture;
/*
* Header for deferred operation list.
@@ -63,4 +64,40 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+ /* List of other capture structures. */
+ struct list_head dfc_list;
+
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+
+ /* Block reservations for the data and rt devices. */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
+
+ /*
+ * An inode reference that must be maintained to complete the deferred
+ * work.
+ */
+ struct xfs_inode *dfc_capture_ip;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except during log recovery.
+ */
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+ struct xfs_inode *capture_ip, struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp);
+void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 5a2db00b9d5f..6766417d5ba4 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -69,6 +69,13 @@ xfs_dquot_verify(
ddq_type != XFS_DQTYPE_GROUP)
return __this_address;
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id)
+ return __this_address;
+
if (id != -1 && id != be32_to_cpu(ddq->d_id))
return __this_address;
@@ -288,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+
+/* Convert an on-disk timer value into an incore timer value. */
+time64_t
+xfs_dquot_from_disk_ts(
+ struct xfs_disk_dquot *ddq,
+ __be32 dtimer)
+{
+ uint32_t t = be32_to_cpu(dtimer);
+
+ if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME))
+ return xfs_dq_bigtime_to_unix(t);
+
+ return t;
+}
+
+/* Convert an incore timer value into an on-disk timer value. */
+__be32
+xfs_dquot_to_disk_ts(
+ struct xfs_dquot *dqp,
+ time64_t timer)
+{
+ uint32_t t = timer;
+
+ if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME))
+ t = xfs_dq_unix_to_bigtime(timer);
+
+ return cpu_to_be32(t);
+}
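A usage sketch for the new converters (editorial; dqp and ddq are assumed to
be a matched incore/ondisk pair, and q_blk.timer follows the incore dquot
layout introduced earlier in this series):

	/* Encode one grace-period expiration on the way out... */
	ddq->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);

	/* ...and decode it again on the way back in. */
	dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddq, ddq->d_btimer);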
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 31b7ece985bb..dd764da08f6f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -449,10 +449,12 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
+#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
- XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ XFS_SB_FEAT_RO_COMPAT_REFLINK| \
+ XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -465,10 +467,12 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
- XFS_SB_FEAT_INCOMPAT_META_UUID)
+ XFS_SB_FEAT_INCOMPAT_META_UUID| \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -563,6 +567,23 @@ static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
}
+static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME);
+}
+
+/*
+ * Inode btree block counter. We record the number of inobt and finobt blocks
+ * in the AGI header so that we can skip the finobt walk at mount time when
+ * setting up per-AG reservations.
+ */
+static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT);
+}
+
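A hedged sketch of the payoff described in the comment above: when the feature
bit is set, per-AG reservation sizing can read the maintained counter instead
of walking the free inode btree (cur, agi, and the surrounding error handling
are assumptions):

	xfs_extlen_t	finobt_len;

	if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
		finobt_len = be32_to_cpu(agi->agi_fblocks);	/* O(1) */
	else
		error = xfs_btree_count_blocks(cur, &finobt_len);	/* full walk */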
/*
* end of superblock version macros
*/
@@ -765,6 +786,9 @@ typedef struct xfs_agi {
__be32 agi_free_root; /* root of the free inode btree */
__be32 agi_free_level;/* levels in free inode btree */
+ __be32 agi_iblocks; /* inobt blocks used */
+ __be32 agi_fblocks; /* finobt blocks used */
+
/* structure must be padded to 64 bit alignment */
} xfs_agi_t;
@@ -785,7 +809,8 @@ typedef struct xfs_agi {
#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
#define XFS_AGI_FREE_ROOT (1 << 11)
#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_NUM_BITS_R2 13
+#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
@@ -831,10 +856,87 @@ struct xfs_agfl {
ASSERT(xfs_daddr_to_agno(mp, d) == \
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
-typedef struct xfs_timestamp {
+/*
+ * XFS Timestamps
+ * ==============
+ *
+ * Traditional ondisk inode timestamps consist of signed 32-bit counters for
+ * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC
+ * 1970, which means that the timestamp epoch is the same as the Unix epoch.
+ * Therefore, the ondisk min and max defined here can be used directly to
+ * constrain the incore timestamps on a Unix system. Note that we actually
+ * encode a __be64 value on disk.
+ *
+ * When the bigtime feature is enabled, ondisk inode timestamps become an
+ * unsigned 64-bit nanoseconds counter. This means that the bigtime inode
+ * timestamp epoch is the start of the classic timestamp range, which is
+ * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * /must/ use the bigtime conversion functions when encoding and decoding raw
+ * timestamps.
+ */
+typedef __be64 xfs_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_timestamp {
__be32 t_sec; /* timestamp seconds */
__be32 t_nsec; /* timestamp nanoseconds */
-} xfs_timestamp_t;
+};
+
+/*
+ * Smallest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN)
+
+/*
+ * Largest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038.
+ */
+#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX)
+
+/*
+ * Smallest possible ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with the traditional
+ * minimum timestamp of Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_BIGTIME_TIME_MIN ((int64_t)0)
+
+/*
+ * Largest supported ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with an incore timestamp
+ * of Jul 2 20:20:24 UTC 2486.
+ *
+ * We round down the ondisk limit so that the bigtime quota and inode max
+ * timestamps will be the same.
+ */
+#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL))
+
+/*
+ * Bigtime epoch is set exactly to the minimum time value that a traditional
+ * 32-bit timestamp can represent when using the Unix epoch as a reference.
+ * Hence the Unix epoch is at a fixed offset into the supported bigtime
+ * timestamp range.
+ *
+ * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS
+ * timestamp can represent so we will not lose any fidelity in converting
+ * to/from unix and bigtime timestamps.
+ *
+ * The following conversion factor converts a seconds counter from the Unix
+ * epoch to the bigtime epoch.
+ */
+#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN)
+
+/* Convert a timestamp from the Unix epoch to the bigtime epoch. */
+static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds)
+{
+ return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET;
+}
+
+/* Convert a timestamp from the bigtime epoch to the Unix epoch. */
+static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
+}
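A worked check of the epoch shift, following directly from the definitions
above (editorial):

	/*
	 * XFS_BIGTIME_EPOCH_OFFSET == 2147483648, so:
	 *   xfs_unix_to_bigtime(0)                   == 2147483648
	 *   xfs_unix_to_bigtime(XFS_LEGACY_TIME_MIN) == 0
	 *   xfs_bigtime_to_unix(0)                   == S32_MIN
	 *                                 (Dec 13 20:45:52 UTC 1901)
	 */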
/*
* On-disk inode structure.
@@ -1061,12 +1163,22 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
+#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_ANY \
- (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
+ XFS_DIFLAG2_BIGTIME)
+
+static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
+}
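A hedged decoding sketch showing how the flag selects between the two
timestamp formats above (the timespec64 destination and the di_mtime access
are illustrative):

	uint64_t t = be64_to_cpu(dip->di_mtime);
	struct timespec64 ts;
	u32 ns;

	if (xfs_dinode_has_bigtime(dip)) {
		/* unsigned 64-bit nanoseconds since the bigtime epoch */
		ts.tv_sec = xfs_bigtime_to_unix(div_u64_rem(t, NSEC_PER_SEC, &ns));
		ts.tv_nsec = ns;
	}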
/*
* Inode number format:
@@ -1152,13 +1264,98 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DQTYPE_USER 0x01 /* user dquot record */
#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */
#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */
+#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */
/* bitmask to determine if this is a user/group/project dquot */
#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
XFS_DQTYPE_PROJ | \
XFS_DQTYPE_GROUP)
-#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK)
+#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \
+ XFS_DQTYPE_BIGTIME)
+
+/*
+ * XFS Quota Timers
+ * ================
+ *
+ * Traditional quota grace period expiration timers are an unsigned 32-bit
+ * seconds counter; time zero is the Unix epoch, Jan 1 00:00:00 UTC 1970.
+ * Note that an expiration value of zero means that the quota limit has not
+ * been reached, and therefore no expiration has been set. Therefore, the
+ * ondisk min and max defined here can be used directly to constrain the incore
+ * quota expiration timestamps on a Unix system.
+ *
+ * When bigtime is enabled, we trade two bits of precision to expand the
+ * expiration timeout range to match that of big inode timestamps. The min and
+ * max recorded here are the on-disk limits, not a Unix timestamp.
+ *
+ * The grace period for each quota type is stored in the root dquot (id = 0)
+ * and is applied to a non-root dquot when it exceeds the soft or hard limits.
+ * Quota grace period lengths are unsigned 32-bit quantities measured in
+ * units of seconds. A value of zero means to use the default period.
+ */
+
+/*
+ * Smallest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration of Jan 1 00:00:01 UTC 1970.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1)
+
+/*
+ * Largest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX)
+
+/*
+ * Smallest possible ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with the incore
+ * expiration of Jan 1 00:00:04 UTC 1970.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN)
+
+/*
+ * Largest supported ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with an incore
+ * expiration of Jul 2 20:20:24 UTC 2486.
+ *
+ * The ondisk field supports values up to -1U, which corresponds to an incore
+ * expiration in 2514. This is beyond the maximum bigtime inode timestamp,
+ * so we cap the maximum bigtime quota expiration to the max inode timestamp.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U)
+
+/*
+ * The following conversion factors assist in converting a quota expiration
+ * timestamp between the incore and ondisk formats.
+ */
+#define XFS_DQ_BIGTIME_SHIFT (2)
+#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1)
+
+/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */
+static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds)
+{
+ /*
+ * Round the expiration timestamp up to the nearest bigtime timestamp
+ * that we can store, to give users the most time to fix problems.
+ */
+ return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >>
+ XFS_DQ_BIGTIME_SHIFT;
+}
+
+/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */
+static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT;
+}
+
+/*
+ * Default quota grace periods, ranging from zero (use the compiled defaults)
+ * to ~136 years. These are applied to a non-root dquot that has exceeded
+ * either limit.
+ */
+#define XFS_DQ_GRACE_MIN ((int64_t)0)
+#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
/*
* This is the main portion of the on-disk representation of quota information
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 84bcffa87753..2a2e3cfd94f0 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -249,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index a6b37db55169..974e71bc4a3a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2473,6 +2473,7 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_unlinked),
offsetof(xfs_agi_t, agi_free_root),
offsetof(xfs_agi_t, agi_free_level),
+ offsetof(xfs_agi_t, agi_iblocks),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
@@ -2806,6 +2807,10 @@ xfs_ialloc_setup_geometry(
uint64_t icount;
uint inodes;
+ igeo->new_diflags2 = 0;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 3c8aebc36e64..cc919a2ee870 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -67,6 +67,25 @@ xfs_finobt_set_root(
XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
}
+/* Update the inode btree block counter for this btree. */
+static inline void
+xfs_inobt_mod_blockcount(
+ struct xfs_btree_cur *cur,
+ int howmuch)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb))
+ return;
+
+ if (cur->bc_btnum == XFS_BTNUM_FINO)
+ be32_add_cpu(&agi->agi_fblocks, howmuch);
+ else if (cur->bc_btnum == XFS_BTNUM_INO)
+ be32_add_cpu(&agi->agi_iblocks, howmuch);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
+}
+
STATIC int
__xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
@@ -102,6 +121,7 @@ __xfs_inobt_alloc_block(
new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
*stat = 1;
+ xfs_inobt_mod_blockcount(cur, 1);
return 0;
}
@@ -134,6 +154,7 @@ __xfs_inobt_free_block(
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
+ xfs_inobt_mod_blockcount(cur, -1);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
&XFS_RMAP_OINFO_INOBT, resv);
@@ -480,19 +501,29 @@ xfs_inobt_commit_staged_btree(
{
struct xfs_agi *agi = agbp->b_addr;
struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+ int fields;
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
if (cur->bc_btnum == XFS_BTNUM_INO) {
+ fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
agi->agi_root = cpu_to_be32(afake->af_root);
agi->agi_level = cpu_to_be32(afake->af_levels);
- xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+ if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ agi->agi_iblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
} else {
+ fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
agi->agi_free_root = cpu_to_be32(afake->af_root);
agi->agi_free_level = cpu_to_be32(afake->af_levels);
- xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT |
- XFS_AGI_FREE_LEVEL);
+ if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ agi->agi_fblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
}
}
@@ -673,6 +704,28 @@ xfs_inobt_count_blocks(
return error;
}
+/* Read finobt block count from AGI header. */
+static int
+xfs_finobt_read_blocks(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agi *agi;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error)
+ return error;
+
+ agi = agbp->b_addr;
+ *tree_blocks = be32_to_cpu(agi->agi_fblocks);
+ xfs_trans_brelse(tp, agbp);
+ return 0;
+}
+
/*
* Figure out how many blocks to reserve and how many are used by this btree.
*/
@@ -690,7 +743,11 @@ xfs_finobt_calc_reserves(
if (!xfs_sb_version_hasfinobt(&mp->m_sb))
return 0;
- error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len);
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
+ error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len);
+ else
+ error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO,
+ &tree_len);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 52451809c478..b4164256993d 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -603,7 +603,7 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
+ new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
ifp->if_u1.if_root = new;
cur->leaf = new;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 8d5dd08eab75..c667c63f2cb0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -157,6 +157,36 @@ xfs_imap_to_bp(
return 0;
}
+static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
+{
+ struct timespec64 tv;
+ uint32_t n;
+
+ tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n));
+ tv.tv_nsec = n;
+
+ return tv;
+}
+
+/* Convert an ondisk timestamp to an incore timestamp. */
+struct timespec64
+xfs_inode_from_disk_ts(
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+ struct xfs_legacy_timestamp *lts;
+
+ if (xfs_dinode_has_bigtime(dip))
+ return xfs_inode_decode_bigtime(be64_to_cpu(ts));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ tv.tv_sec = (int)be32_to_cpu(lts->t_sec);
+ tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec);
+
+ return tv;
+}
+
int
xfs_inode_from_disk(
struct xfs_inode *ip,
@@ -211,12 +241,9 @@ xfs_inode_from_disk(
* a time before epoch is converted to a time long after epoch
* on 64 bit systems.
*/
- inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
- inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
- inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
- inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
- inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
- inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
+ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
+ inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
to->di_size = be64_to_cpu(from->di_size);
to->di_nblocks = be64_to_cpu(from->di_nblocks);
@@ -229,8 +256,7 @@ xfs_inode_from_disk(
if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
- to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec);
- to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+ to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
to->di_flags2 = be64_to_cpu(from->di_flags2);
to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
}
@@ -252,6 +278,25 @@ out_destroy_data_fork:
return error;
}
+/* Convert an incore timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_inode_to_disk_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_timestamp *lts;
+ xfs_timestamp_t ts;
+
+ if (xfs_inode_has_bigtime(ip))
+ return cpu_to_be64(xfs_inode_encode_bigtime(tv));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lts->t_sec = cpu_to_be32(tv.tv_sec);
+ lts->t_nsec = cpu_to_be32(tv.tv_nsec);
+
+ return ts;
+}
+
void
xfs_inode_to_disk(
struct xfs_inode *ip,
@@ -271,12 +316,9 @@ xfs_inode_to_disk(
to->di_projid_hi = cpu_to_be16(from->di_projid >> 16);
memset(to->di_pad, 0, sizeof(to->di_pad));
- to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
- to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
- to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
- to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
- to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
to->di_mode = cpu_to_be16(inode->i_mode);
@@ -295,8 +337,7 @@ xfs_inode_to_disk(
if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
to->di_version = 3;
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec);
+ to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
@@ -310,58 +351,6 @@ xfs_inode_to_disk(
}
}
-void
-xfs_log_dinode_to_disk(
- struct xfs_log_dinode *from,
- struct xfs_dinode *to)
-{
- to->di_magic = cpu_to_be16(from->di_magic);
- to->di_mode = cpu_to_be16(from->di_mode);
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_onlink = 0;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_nlink = cpu_to_be32(from->di_nlink);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-
- to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
- to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
- to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-
- to->di_size = cpu_to_be64(from->di_size);
- to->di_nblocks = cpu_to_be64(from->di_nblocks);
- to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
- to->di_dmstate = cpu_to_be16(from->di_dmstate);
- to->di_flags = cpu_to_be16(from->di_flags);
- to->di_gen = cpu_to_be32(from->di_gen);
-
- if (from->di_version == 3) {
- to->di_changecount = cpu_to_be64(from->di_changecount);
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
- to->di_flags2 = cpu_to_be64(from->di_flags2);
- to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
- to->di_ino = cpu_to_be64(from->di_ino);
- to->di_lsn = cpu_to_be64(from->di_lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
- uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
- } else {
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
- }
-}
-
static xfs_failaddr_t
xfs_dinode_verify_fork(
struct xfs_dinode *dip,
@@ -568,6 +557,11 @@ xfs_dinode_verify(
if (fa)
return fa;
+ /* bigtime iflag can only happen on bigtime filesystems */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ return __this_address;
+
return NULL;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 6b08b9d060c2..ef5eaf33d146 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -17,7 +17,7 @@ struct xfs_dinode;
*/
struct xfs_icdinode {
uint16_t di_flushiter; /* incremented on flush */
- uint32_t di_projid; /* owner's project id */
+ prid_t di_projid; /* owner's project id */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
@@ -32,6 +32,11 @@ struct xfs_icdinode {
struct timespec64 di_crtime; /* time created */
};
+static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd)
+{
+ return icd->di_flags2 & XFS_DIFLAG2_BIGTIME;
+}
+
/*
* Inode location information. Stored in the inode and passed to
* xfs_imap_to_bp() to get a buffer and dinode for a given inode.
@@ -49,8 +54,6 @@ void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
xfs_lsn_t lsn);
int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
-void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
- struct xfs_dinode *to);
xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_dinode *dip);
@@ -60,4 +63,12 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
uint32_t cowextsize, uint16_t mode, uint16_t flags,
uint64_t flags2);
+static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
+{
+ return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec;
+}
+
+struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip,
+ const xfs_timestamp_t ts);
+
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 0cf853d42d62..7575de5cecb1 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -386,8 +386,8 @@ xfs_iroot_realloc(
cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
- ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- KM_NOFS);
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -496,8 +496,8 @@ xfs_idata_realloc(
* in size so that it can be logged and stay on word boundaries.
* We enforce that here.
*/
- ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
- roundup(new_size, 4), KM_NOFS);
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+ GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e3400c9c71cd..8bd00da6d2a4 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -368,10 +368,13 @@ static inline int xfs_ilog_fdata(int w)
* directly mirrors the xfs_dinode structure as it must contain all the same
* information.
*/
-typedef struct xfs_ictimestamp {
+typedef uint64_t xfs_ictimestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_ictimestamp {
int32_t t_sec; /* timestamp seconds */
int32_t t_nsec; /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
+};
/*
* Define the format of the inode core that is logged. This structure must be
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 641132d0e39d..3cca2bfe714c 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -121,7 +121,6 @@ struct xlog_recover {
void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len,
const struct xfs_buf_ops *ops);
bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
-void xlog_recover_iodone(struct xfs_buf *bp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 076bdc7037ee..0f0af4e35032 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -23,7 +23,8 @@ typedef uint8_t xfs_dqtype_t;
#define XFS_DQTYPE_STRINGS \
{ XFS_DQTYPE_USER, "USER" }, \
{ XFS_DQTYPE_PROJ, "PROJ" }, \
- { XFS_DQTYPE_GROUP, "GROUP" }
+ { XFS_DQTYPE_GROUP, "GROUP" }, \
+ { XFS_DQTYPE_BIGTIME, "BIGTIME" }
/*
* flags for q_flags field in the dquot.
@@ -143,4 +144,9 @@ extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
xfs_dqid_t id, xfs_dqtype_t type);
+struct xfs_dquot;
+time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
+ __be32 dtimer);
+__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
+
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 27c39268c31f..340c83f76c80 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2505,12 +2505,15 @@ xfs_rmap_map_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_MAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -2521,12 +2524,15 @@ xfs_rmap_unmap_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_UNMAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/*
@@ -2543,12 +2549,15 @@ xfs_rmap_convert_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+
if (!xfs_rmap_update_is_needed(mp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_CONVERT_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 1d9fa8a300f1..6c1aba16113c 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1018,7 +1018,6 @@ xfs_rtalloc_query_range(
struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
- xfs_rtblock_t rem;
int is_free;
int error = 0;
@@ -1027,13 +1026,12 @@ xfs_rtalloc_query_range(
if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
low_rec->ar_startext == high_rec->ar_startext)
return 0;
- if (high_rec->ar_startext > mp->m_sb.sb_rextents)
- high_rec->ar_startext = mp->m_sb.sb_rextents;
+ high_rec->ar_startext = min(high_rec->ar_startext,
+ mp->m_sb.sb_rextents - 1);
/* Iterate the bitmap, looking for discrepancies. */
rtstart = low_rec->ar_startext;
- rem = high_rec->ar_startext - rtstart;
- while (rem) {
+ while (rtstart <= high_rec->ar_startext) {
/* Is the first block free? */
error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
&is_free);
@@ -1042,7 +1040,7 @@ xfs_rtalloc_query_range(
/* How long does the extent go for? */
error = xfs_rtfind_forw(mp, tp, rtstart,
- high_rec->ar_startext - 1, &rtend);
+ high_rec->ar_startext, &rtend);
if (error)
break;
@@ -1055,7 +1053,6 @@ xfs_rtalloc_query_range(
break;
}
- rem -= rtend - rtstart + 1;
rtstart = rtend + 1;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ae9aaf1f34bf..5aeafa59ed27 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -954,7 +954,7 @@ xfs_log_sb(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *bp = xfs_trans_getsb(tp, mp);
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
@@ -1084,7 +1084,7 @@ xfs_sync_sb_buf(
if (error)
return error;
- bp = xfs_trans_getsb(tp, mp);
+ bp = xfs_trans_getsb(tp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
xfs_trans_set_sync(tp);
@@ -1166,6 +1166,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT;
if (xfs_sb_version_hasreflink(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
+ if (xfs_sb_version_hasbigtime(sbp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
if (xfs_sb_version_hassector(sbp))
geo->logsectsize = sbp->sb_logsectsize;
else
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 708feb8eac76..c795ae47b3c9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -176,6 +176,9 @@ struct xfs_ino_geometry {
unsigned int ialloc_align;
unsigned int agino_log; /* #bits for agino in inum */
+
+ /* precomputed value for di_flags2 */
+ uint64_t new_diflags2;
};
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index b7e222befb08..90f1d5645052 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -132,6 +132,17 @@ xfs_trans_log_inode(
}
/*
+ * If we're updating the inode core or the timestamps and it's possible
+ * to upgrade this inode to bigtime format, do so now.
+ */
+ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+ xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) &&
+ !xfs_inode_has_bigtime(ip)) {
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
* Record the specific change for fdatasync optimisation. This allows
* fdatasync to skip log forces for inodes that are only timestamp
* dirty.
@@ -177,9 +188,9 @@ xfs_trans_log_inode(
/*
* Always OR in the bits from the ili_last_fields field. This is to
- * coordinate with the xfs_iflush() and xfs_iflush_done() routines in
- * the eventual clearing of the ili_fields bits. See the big comment in
- * xfs_iflush() for an explanation of this coordination mechanism.
+ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+ * in the eventual clearing of the ili_fields bits. See the big comment
+ * in xfs_iflush() for an explanation of this coordination mechanism.
*/
iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
spin_unlock(&iip->ili_lock);
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index e9bcf1faa183..ae8e2e0ac64a 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -781,6 +781,35 @@ xchk_agi_xref_icounts(
xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
}
+/* Check agi_[fi]blocks against tree size */
+static inline void
+xchk_agi_xref_fiblocks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agblock_t blocks;
+ int error = 0;
+
+ if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb))
+ return;
+
+ if (sc->sa.ino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_iblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.fino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_fblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+}
+
/* Cross-reference with the other btrees. */
STATIC void
xchk_agi_xref(
@@ -804,6 +833,7 @@ xchk_agi_xref(
xchk_agi_xref_icounts(sc);
xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_agi_xref_fiblocks(sc);
/* scrub teardown will take care of sc->sa for us */
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index bca2ab1d4be9..401f71579ce6 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -810,10 +810,34 @@ xrep_agi_calc_from_btrees(
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ xfs_agblock_t blocks;
+
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ agi->agi_iblocks = cpu_to_be32(blocks);
+ }
xfs_btree_del_cursor(cur, error);
agi->agi_count = cpu_to_be32(count);
agi->agi_freecount = cpu_to_be32(freecount);
+
+ if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ xfs_agblock_t blocks;
+
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno,
+ XFS_BTNUM_FINO);
+ if (error)
+ goto err;
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ agi->agi_fblocks = cpu_to_be32(blocks);
+ }
+
return 0;
err:
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index e56786f0a13c..653f3280e1c1 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -441,6 +441,20 @@ xchk_da_btree_block(
goto out_freebp;
}
+ /*
+ * If we've been handed a block that is below the dabtree root, does
+ * its hashval match what the parent block expected to see?
+ */
+ if (level > 0) {
+ struct xfs_da_node_entry *key;
+
+ key = xchk_da_btree_node_entry(ds, level - 1);
+ if (be32_to_cpu(key->hashval) != blk->hashval) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
out:
return error;
out_freebp:
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6d483ab29e63..3aa85b64de36 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -190,11 +190,30 @@ xchk_inode_flags2(
if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK))
goto bad;
+ /* no bigtime iflag without the bigtime feature */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ goto bad;
+
return;
bad:
xchk_ino_set_corrupt(sc, ino);
}
+static inline void
+xchk_dinode_nsec(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+
+ tv = xfs_inode_from_disk_ts(dip, ts);
+ if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC)
+ xchk_ino_set_corrupt(sc, ino);
+}
+
/* Scrub all the ondisk inode fields. */
STATIC void
xchk_dinode(
@@ -293,12 +312,9 @@ xchk_dinode(
}
/* di_[amc]time.nsec */
- if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_atime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_mtime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_ctime);
/*
* di_size. xfs_dinode_verify checks for things that screw up
@@ -403,8 +419,7 @@ xchk_dinode(
}
if (dip->di_version >= 3) {
- if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_crtime);
xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
xchk_inode_cowextsize(sc, dip, ino, mode, flags,
flags2);
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 5641ae512c9e..c08be5ede066 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@ xchk_setup_symlink(
struct xfs_inode *ip)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index d4c687b5cd06..c544951a0c07 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -192,7 +192,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (acl) {
args.valuelen = XFS_ACL_SIZE(acl->a_count);
- args.value = kmem_zalloc_large(args.valuelen, 0);
+ args.value = kvzalloc(args.valuelen, GFP_KERNEL);
if (!args.value)
return -ENOMEM;
xfs_acl_to_disk(args.value, acl);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b35611882ff9..55d126d4e096 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -544,7 +544,7 @@ xfs_discard_page(
page, ip->i_ino, offset);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- PAGE_SIZE / i_blocksize(inode));
+ i_blocks_per_page(inode, page));
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 50f922cad91a..8f8837fe21cf 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -61,7 +61,7 @@ xfs_attr_shortform_list(
int error = 0;
ASSERT(dp->i_afp != NULL);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
return 0;
@@ -96,7 +96,7 @@ xfs_attr_shortform_list(
*/
if (context->seen_enough)
break;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
trace_xfs_attr_list_sf_all(context);
return 0;
@@ -136,7 +136,7 @@ xfs_attr_shortform_list(
/* These are bytes, and both on-disk, don't endian-flip */
sbp->valuelen = sfe->valuelen;
sbp->flags = sfe->flags;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
sbp++;
nsbuf++;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index ec3691372e7c..9e16a4d0f97c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -24,6 +24,7 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_quota.h"
kmem_zone_t *xfs_bui_zone;
kmem_zone_t *xfs_bud_zone;
@@ -423,30 +424,26 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
STATIC int
xfs_bui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_bmbt_irec irec;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
struct xfs_map_extent *bmap;
struct xfs_bud_log_item *budp;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
xfs_filblks_t count;
xfs_exntst_t state;
- enum xfs_bmap_intent_type type;
- bool op_ok;
unsigned int bui_type;
int whichfork;
int error = 0;
/* Only one mapping operation per BUI... */
- if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- xfs_bui_release(buip);
+ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
return -EFSCORRUPTED;
- }
/*
* First check the validity of the extent described by the
@@ -457,76 +454,58 @@ xfs_bui_item_recover(
XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
XFS_INO_TO_FSB(mp, bmap->me_owner)));
- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ switch (bui_type) {
case XFS_BMAP_MAP:
case XFS_BMAP_UNMAP:
- op_ok = true;
break;
default:
- op_ok = false;
- break;
+ return -EFSCORRUPTED;
}
- if (!op_ok || startblock_fsb == 0 ||
+ if (startblock_fsb == 0 ||
bmap->me_len == 0 ||
inode_fsb == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
bmap->me_len >= mp->m_sb.sb_agblocks ||
inode_fsb >= mp->m_sb.sb_dblocks ||
- (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the BUI from the AIL and
- * free the memory associated with it.
- */
- xfs_bui_release(buip);
+ (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS))
return -EFSCORRUPTED;
- }
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ /* Grab the inode. */
+ error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
- budp = xfs_trans_get_bud(tp, buip);
- /* Grab the inode. */
- error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
+ error = xfs_qm_dqattach(ip);
if (error)
- goto err_inode;
+ goto err_rele;
if (VFS_I(ip)->i_nlink == 0)
xfs_iflags_set(ip, XFS_IRECOVERY);
- /* Process deferred bmap item. */
- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
- switch (bui_type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- type = bui_type;
- break;
- default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- error = -EFSCORRUPTED;
- goto err_inode;
- }
+ /* Allocate transaction and do the work. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ budp = xfs_trans_get_bud(tp, buip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork,
- bmap->me_startoff, bmap->me_startblock, &count, state);
+ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
+ whichfork, bmap->me_startoff, bmap->me_startblock,
+ &count, state);
if (error)
- goto err_inode;
+ goto err_cancel;
if (count > 0) {
- ASSERT(type == XFS_BMAP_UNMAP);
+ ASSERT(bui_type == XFS_BMAP_UNMAP);
irec.br_startblock = bmap->me_startblock;
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
@@ -534,20 +513,24 @@ xfs_bui_item_recover(
xfs_bmap_unmap_extent(tp, ip, &irec);
}
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
+ /*
+ * Commit transaction, which frees the transaction and saves the inode
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ if (error)
+ goto err_unlock;
+
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
+ return 0;
- return error;
-
-err_inode:
- xfs_defer_move(parent_tp, tp);
+err_cancel:
xfs_trans_cancel(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- }
+err_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+err_rele:
+ xfs_irele(ip);
return error;
}
@@ -559,6 +542,32 @@ xfs_bui_item_match(
return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_bui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_bud_log_item *budp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = BUI_ITEM(intent)->bui_format.bui_nextents;
+ extp = BUI_ITEM(intent)->bui_format.bui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+ buip = xfs_bui_init(tp->t_mountp);
+ memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp));
+ atomic_set(&buip->bui_next_extent, count);
+ xfs_trans_add_item(tp, &buip->bui_item);
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+ return &buip->bui_item;
+}
+
static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
@@ -566,6 +575,7 @@ static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_release = xfs_bui_item_release,
.iop_recover = xfs_bui_item_recover,
.iop_match = xfs_bui_item_match,
+ .iop_relog = xfs_bui_item_relog,
};
/*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5123f82f2477..f2a8a0e75e1f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -946,6 +946,14 @@ xfs_free_file_space(
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+ /* We can only free complete realtime extents. */
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
+
+ if ((startoffset_fsb | endoffset_fsb) & (extsz - 1))
+ return -EINVAL;
+ }
+
/*
* Need to zero the stuff we're not freeing, on disk.
*/
@@ -1139,6 +1147,14 @@ xfs_insert_file_space(
trace_xfs_insert_file_space(ip);
+ /* We can only insert complete realtime extents. */
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
+
+ if ((stop_fsb | shift_fsb) & (extsz - 1))
+ return -EINVAL;
+ }
+
error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
if (error)
return error;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d4cdcb6fb2fe..4e4cf91f4f9f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -52,6 +52,15 @@ static kmem_zone_t *xfs_buf_zone;
* b_lock (trylock due to inversion)
*/
+static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
+
+static inline int
+xfs_buf_submit(
+ struct xfs_buf *bp)
+{
+ return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
+}
+
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
@@ -751,7 +760,7 @@ found:
return 0;
}
-STATIC int
+int
_xfs_buf_read(
xfs_buf_t *bp,
xfs_buf_flags_t flags)
@@ -759,7 +768,7 @@ _xfs_buf_read(
ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
- bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
return xfs_buf_submit(bp);
@@ -1170,20 +1179,145 @@ xfs_buf_wait_unpin(
set_current_state(TASK_RUNNING);
}
+static void
+xfs_buf_ioerror_alert_ratelimited(
+ struct xfs_buf *bp)
+{
+ static unsigned long lasttime;
+ static struct xfs_buftarg *lasttarg;
+
+ if (bp->b_target != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ xfs_buf_ioerror_alert(bp, __this_address);
+ }
+ lasttarg = bp->b_target;
+}
+
/*
- * Buffer Utility Routines
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
*/
+static bool
+xfs_buf_ioerror_permanent(
+ struct xfs_buf *bp,
+ struct xfs_error_cfg *cfg)
+{
+ struct xfs_mount *mp = bp->b_mount;
-void
+ if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+ ++bp->b_retries > cfg->max_retries)
+ return true;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+ return true;
+
+ /* At unmount we may treat errors differently */
+ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+ return true;
+
+ return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error. If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Returns true if this function took care of error handling and the caller must
+ * not touch the buffer again. Returns false if the caller should proceed with
+ * normal I/O completion handling.
+ */
+static bool
+xfs_buf_ioend_handle_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_error_cfg *cfg;
+
+ /*
+ * If we've already decided to shutdown the filesystem because of I/O
+ * errors, there's no point in giving this a retry.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out_stale;
+
+ xfs_buf_ioerror_alert_ratelimited(bp);
+
+ /*
+ * We're not going to bother about retrying this during recovery.
+ * One strike!
+ */
+ if (bp->b_flags & _XBF_LOGRECOVERY) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+ }
+
+ /*
+ * Synchronous writes will have callers process the error.
+ */
+ if (!(bp->b_flags & XBF_ASYNC))
+ goto out_stale;
+
+ trace_xfs_buf_iodone_async(bp, _RET_IP_);
+
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+ if (bp->b_last_error != bp->b_error ||
+ !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
+ bp->b_last_error = bp->b_error;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ !bp->b_first_retry_time)
+ bp->b_first_retry_time = jiffies;
+ goto resubmit;
+ }
+
+ /*
+ * Permanent error - we need to trigger a shutdown if we haven't already
+ * to indicate that inconsistency will result from this action.
+ */
+ if (xfs_buf_ioerror_permanent(bp, cfg)) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out_stale;
+ }
+
+ /* Still considered a transient error. Caller will schedule retries. */
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_io_fail(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_io_fail(bp);
+ else
+ ASSERT(list_empty(&bp->b_li_list));
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
+
+resubmit:
+ xfs_buf_ioerror(bp, 0);
+ bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+ xfs_buf_submit(bp);
+ return true;
+out_stale:
+ xfs_buf_stale(bp);
+ bp->b_flags |= XBF_DONE;
+ bp->b_flags &= ~XBF_WRITE;
+ trace_xfs_buf_error_relse(bp, _RET_IP_);
+ return false;
+}
+
+static void
xfs_buf_ioend(
struct xfs_buf *bp)
{
- bool read = bp->b_flags & XBF_READ;
-
trace_xfs_buf_iodone(bp, _RET_IP_);
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-
/*
* Pull in IO completion errors now. We are guaranteed to be running
* single threaded, so we don't need the lock to read b_io_error.
@@ -1191,39 +1325,47 @@ xfs_buf_ioend(
if (!bp->b_error && bp->b_io_error)
xfs_buf_ioerror(bp, bp->b_io_error);
- if (read) {
+ if (bp->b_flags & XBF_READ) {
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
if (!bp->b_error)
bp->b_flags |= XBF_DONE;
- xfs_buf_ioend_finish(bp);
- return;
- }
+ } else {
+ if (!bp->b_error) {
+ bp->b_flags &= ~XBF_WRITE_FAIL;
+ bp->b_flags |= XBF_DONE;
+ }
- if (!bp->b_error) {
- bp->b_flags &= ~XBF_WRITE_FAIL;
- bp->b_flags |= XBF_DONE;
- }
+ if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
+ return;
- /*
- * If this is a log recovery buffer, we aren't doing transactional IO
- * yet so we need to let it handle IO completions.
- */
- if (bp->b_flags & _XBF_LOGRECOVERY) {
- xlog_recover_iodone(bp);
- return;
- }
+ /* clear the retry state */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
+ bp->b_first_retry_time = 0;
- if (bp->b_flags & _XBF_INODES) {
- xfs_buf_inode_iodone(bp);
- return;
- }
+ /*
+ * Note that for things like remote attribute buffers, there may
+ * not be a buffer log item here, so processing the buffer log
+ * item must remain optional.
+ */
+ if (bp->b_log_item)
+ xfs_buf_item_done(bp);
+
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_iodone(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_iodone(bp);
- if (bp->b_flags & _XBF_DQUOTS) {
- xfs_buf_dquot_iodone(bp);
- return;
}
- xfs_buf_iodone(bp);
+
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
+ _XBF_LOGRECOVERY);
+
+ if (bp->b_flags & XBF_ASYNC)
+ xfs_buf_relse(bp);
+ else
+ complete(&bp->b_iowait);
}
static void
@@ -1506,7 +1648,7 @@ xfs_buf_iowait(
* safe to reference the buffer after a call to this function unless the caller
* holds an additional reference itself.
*/
-int
+static int
__xfs_buf_submit(
struct xfs_buf *bp,
bool wait)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 755b652e695a..bfd2907e7bc4 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -249,6 +249,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
size_t numblks, int flags, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
+int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
@@ -269,28 +270,12 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
/* Buffer Read and Write Routines */
extern int xfs_bwrite(struct xfs_buf *bp);
-extern void xfs_buf_ioend(struct xfs_buf *bp);
-static inline void xfs_buf_ioend_finish(struct xfs_buf *bp)
-{
- if (bp->b_flags & XBF_ASYNC)
- xfs_buf_relse(bp);
- else
- complete(&bp->b_iowait);
-}
extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
void xfs_buf_ioend_fail(struct xfs_buf *);
-
-extern int __xfs_buf_submit(struct xfs_buf *bp, bool);
-static inline int xfs_buf_submit(struct xfs_buf *bp)
-{
- bool wait = bp->b_flags & XBF_ASYNC ? false : true;
- return __xfs_buf_submit(bp, wait);
-}
-
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 408d1b572d3f..0356f2e340a1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -30,8 +30,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
-static void xfs_buf_item_done(struct xfs_buf *bp);
-
/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
@@ -463,7 +461,7 @@ xfs_buf_item_unpin(
*/
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
xfs_buf_item_done(bp);
- xfs_iflush_done(bp);
+ xfs_buf_inode_iodone(bp);
ASSERT(list_empty(&bp->b_li_list));
} else {
xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
@@ -956,153 +954,10 @@ xfs_buf_item_relse(
xfs_buf_item_free(bip);
}
-/*
- * Decide if we're going to retry the write after a failure, and prepare
- * the buffer for retrying the write.
- */
-static bool
-xfs_buf_ioerror_fail_without_retry(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_mount;
- static ulong lasttime;
- static xfs_buftarg_t *lasttarg;
-
- /*
- * If we've already decided to shutdown the filesystem because of
- * I/O errors, there's no point in giving this a retry.
- */
- if (XFS_FORCED_SHUTDOWN(mp))
- return true;
-
- if (bp->b_target != lasttarg ||
- time_after(jiffies, (lasttime + 5*HZ))) {
- lasttime = jiffies;
- xfs_buf_ioerror_alert(bp, __this_address);
- }
- lasttarg = bp->b_target;
-
- /* synchronous writes will have callers process the error */
- if (!(bp->b_flags & XBF_ASYNC))
- return true;
- return false;
-}
-
-static bool
-xfs_buf_ioerror_retry(
- struct xfs_buf *bp,
- struct xfs_error_cfg *cfg)
-{
- if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) &&
- bp->b_last_error == bp->b_error)
- return false;
-
- bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
- bp->b_last_error = bp->b_error;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- !bp->b_first_retry_time)
- bp->b_first_retry_time = jiffies;
- return true;
-}
-
-/*
- * Account for this latest trip around the retry handler, and decide if
- * we've failed enough times to constitute a permanent failure.
- */
-static bool
-xfs_buf_ioerror_permanent(
- struct xfs_buf *bp,
- struct xfs_error_cfg *cfg)
-{
- struct xfs_mount *mp = bp->b_mount;
-
- if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
- ++bp->b_retries > cfg->max_retries)
- return true;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
- return true;
-
- /* At unmount we may treat errors differently */
- if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
- return true;
-
- return false;
-}
-
-/*
- * On a sync write or shutdown we just want to stale the buffer and let the
- * caller handle the error in bp->b_error appropriately.
- *
- * If the write was asynchronous then no one will be looking for the error. If
- * this is the first failure of this type, clear the error state and write the
- * buffer out again. This means we always retry an async write failure at least
- * once, but we also need to set the buffer up to behave correctly now for
- * repeated failures.
- *
- * If we get repeated async write failures, then we take action according to the
- * error configuration we have been set up to use.
- *
- * Multi-state return value:
- *
- * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions
- * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions
- * XBF_IOERROR_FAIL: transient error, run failure callback completions and then
- * release the buffer
- */
-enum {
- XBF_IOERROR_FINISH,
- XBF_IOERROR_DONE,
- XBF_IOERROR_FAIL,
-};
-
-static int
-xfs_buf_iodone_error(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_mount;
- struct xfs_error_cfg *cfg;
-
- if (xfs_buf_ioerror_fail_without_retry(bp))
- goto out_stale;
-
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-
- cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
- if (xfs_buf_ioerror_retry(bp, cfg)) {
- xfs_buf_ioerror(bp, 0);
- xfs_buf_submit(bp);
- return XBF_IOERROR_DONE;
- }
-
- /*
- * Permanent error - we need to trigger a shutdown if we haven't already
- * to indicate that inconsistency will result from this action.
- */
- if (xfs_buf_ioerror_permanent(bp, cfg)) {
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- goto out_stale;
- }
-
- /* Still considered a transient error. Caller will schedule retries. */
- return XBF_IOERROR_FAIL;
-
-out_stale:
- xfs_buf_stale(bp);
- bp->b_flags |= XBF_DONE;
- trace_xfs_buf_error_relse(bp, _RET_IP_);
- return XBF_IOERROR_FINISH;
-}
-
-static void
+void
xfs_buf_item_done(
struct xfs_buf *bp)
{
- struct xfs_buf_log_item *bip = bp->b_log_item;
-
- if (!bip)
- return;
-
/*
* If we are forcibly shutting down, this may well be off the AIL
* already. That's because we simulate the log-committed callbacks to
@@ -1111,113 +966,12 @@ xfs_buf_item_done(
* xfs_trans_ail_delete() takes care of these.
*
* Either way, AIL is useless if we're forcing a shutdown.
+ *
+ * Note that log recovery writes might have buffer items that are not on
+ * the AIL even when the file system is not shut down.
*/
- xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE);
- bp->b_log_item = NULL;
- xfs_buf_item_free(bip);
- xfs_buf_rele(bp);
-}
-
-static inline void
-xfs_buf_clear_ioerror_retry_state(
- struct xfs_buf *bp)
-{
- bp->b_last_error = 0;
- bp->b_retries = 0;
- bp->b_first_retry_time = 0;
-}
-
-/*
- * Inode buffer iodone callback function.
- */
-void
-xfs_buf_inode_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- struct xfs_log_item *lip;
- int ret = xfs_buf_iodone_error(bp);
-
- if (ret == XBF_IOERROR_FINISH)
- goto finish_iodone;
- if (ret == XBF_IOERROR_DONE)
- return;
- ASSERT(ret == XBF_IOERROR_FAIL);
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- set_bit(XFS_LI_FAILED, &lip->li_flags);
- }
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return;
- }
-
-finish_iodone:
- xfs_buf_clear_ioerror_retry_state(bp);
- xfs_buf_item_done(bp);
- xfs_iflush_done(bp);
- xfs_buf_ioend_finish(bp);
-}
-
-/*
- * Dquot buffer iodone callback function.
- */
-void
-xfs_buf_dquot_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- struct xfs_log_item *lip;
- int ret = xfs_buf_iodone_error(bp);
-
- if (ret == XBF_IOERROR_FINISH)
- goto finish_iodone;
- if (ret == XBF_IOERROR_DONE)
- return;
- ASSERT(ret == XBF_IOERROR_FAIL);
- spin_lock(&bp->b_mount->m_ail->ail_lock);
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- xfs_set_li_failed(lip, bp);
- }
- spin_unlock(&bp->b_mount->m_ail->ail_lock);
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return;
- }
-
-finish_iodone:
- xfs_buf_clear_ioerror_retry_state(bp);
- /* a newly allocated dquot buffer might have a log item attached */
- xfs_buf_item_done(bp);
- xfs_dquot_done(bp);
- xfs_buf_ioend_finish(bp);
-}
-
-/*
- * Dirty buffer iodone callback function.
- *
- * Note that for things like remote attribute buffers, there may not be a buffer
- * log item here, so processing the buffer log item must remain be optional.
- */
-void
-xfs_buf_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- int ret = xfs_buf_iodone_error(bp);
-
- if (ret == XBF_IOERROR_FINISH)
- goto finish_iodone;
- if (ret == XBF_IOERROR_DONE)
- return;
- ASSERT(ret == XBF_IOERROR_FAIL);
- ASSERT(list_empty(&bp->b_li_list));
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return;
- }
-
-finish_iodone:
- xfs_buf_clear_ioerror_retry_state(bp);
- xfs_buf_item_done(bp);
- xfs_buf_ioend_finish(bp);
+ xfs_trans_ail_delete(&bp->b_log_item->bli_item,
+ (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
+ SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_item_relse(bp);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 23507cbb4c41..50aa0f5ef959 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -50,12 +50,24 @@ struct xfs_buf_log_item {
};
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void xfs_buf_item_done(struct xfs_buf *bp);
void xfs_buf_item_relse(struct xfs_buf *);
bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_inode_iodone(struct xfs_buf *);
+void xfs_buf_inode_io_fail(struct xfs_buf *bp);
+#ifdef CONFIG_XFS_QUOTA
void xfs_buf_dquot_iodone(struct xfs_buf *);
+void xfs_buf_dquot_io_fail(struct xfs_buf *bp);
+#else
+static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
+{
+}
+static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
+{
+}
+#endif /* CONFIG_XFS_QUOTA */
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 8f0457d67d77..d44e8b4a3391 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -414,7 +414,7 @@ xlog_recover_validate_buf_type(
*
* Write verifiers update the metadata LSN from log items attached to
* the buffer. Therefore, initialize a bli purely to carry the LSN to
- * the verifier. We'll clean it up in our ->iodone() callback.
+ * the verifier.
*/
if (bp->b_ops) {
struct xfs_buf_log_item *bip;
@@ -719,6 +719,8 @@ xlog_recover_get_buf_lsn(
case XFS_ABTC_MAGIC:
case XFS_RMAP_CRC_MAGIC:
case XFS_REFC_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
case XFS_IBT_CRC_MAGIC:
case XFS_IBT_MAGIC: {
struct xfs_btree_block *btb = blk;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bcd73b9c2994..1d95ed387d66 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -98,12 +98,33 @@ xfs_qm_adjust_dqlimits(
xfs_dquot_set_prealloc_limits(dq);
}
+/* Set the expiration time of a quota's grace period. */
+time64_t
+xfs_dquot_set_timeout(
+ struct xfs_mount *mp,
+ time64_t timeout)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+
+ return clamp_t(time64_t, timeout, qi->qi_expiry_min,
+ qi->qi_expiry_max);
+}
+
+/* Set the length of the default grace period. */
+time64_t
+xfs_dquot_set_grace_period(
+ time64_t grace)
+{
+ return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX);
+}
+
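
clamp_t(type, val, lo, hi) evaluates to min(max(val, lo), hi) in the named type, so both helpers above simply pin a timestamp into an administrator-configurable window. A standalone C equivalent, with invented DQ_GRACE_* bounds standing in for the real quota limits:

#include <stdio.h>
#include <stdint.h>

typedef int64_t time64_t;

/* Like the kernel macro; note it may evaluate its arguments twice. */
#define clamp_t(type, val, lo, hi) \
	((type)(val) < (type)(lo) ? (type)(lo) : \
	 (type)(val) > (type)(hi) ? (type)(hi) : (type)(val))

/* Hypothetical bounds; the real limits live in the XFS quota headers. */
#define DQ_GRACE_MIN ((time64_t)0)
#define DQ_GRACE_MAX ((time64_t)INT32_MAX)

static time64_t set_grace_period(time64_t grace)
{
	return clamp_t(time64_t, grace, DQ_GRACE_MIN, DQ_GRACE_MAX);
}

int main(void)
{
	printf("%lld\n", (long long)set_grace_period(-5));      /* clamped to 0 */
	printf("%lld\n", (long long)set_grace_period(604800));  /* 7 days, kept */
	return 0;
}
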
/*
* Determine if this quota counter is over either limit and set the quota
* timers as appropriate.
*/
static inline void
xfs_qm_adjust_res_timer(
+ struct xfs_mount *mp,
struct xfs_dquot_res *res,
struct xfs_quota_limits *qlim)
{
@@ -112,7 +133,8 @@ xfs_qm_adjust_res_timer(
if ((res->softlimit && res->count > res->softlimit) ||
(res->hardlimit && res->count > res->hardlimit)) {
if (res->timer == 0)
- res->timer = ktime_get_real_seconds() + qlim->time;
+ res->timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + qlim->time);
} else {
if (res->timer == 0)
res->warnings = 0;
@@ -145,9 +167,9 @@ xfs_qm_adjust_dqtimers(
ASSERT(dq->q_id);
defq = xfs_get_defquota(qi, xfs_dquot_type(dq));
- xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk);
- xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino);
- xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb);
}
/*
@@ -201,6 +223,8 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_type = type;
+ if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb))
+ d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
@@ -514,9 +538,9 @@ xfs_dquot_from_disk(
dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
- dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer);
- dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer);
- dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer);
+ dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
+ dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
+ dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
/*
* Reservation counters are defined as reservation plus current usage
@@ -559,9 +583,9 @@ xfs_dquot_to_disk(
ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
- ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer);
- ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer);
- ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer);
+ ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
+ ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
+ ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer);
}
/* Allocate and initialize the dquot buffer for this in-core dquot. */
@@ -807,8 +831,8 @@ xfs_qm_dqget_checks(
}
/*
- * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked
- * dquot, doing an allocation (if requested) as needed.
+ * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a
+ * locked dquot, doing an allocation (if requested) as needed.
*/
int
xfs_qm_dqget(
@@ -1107,7 +1131,7 @@ xfs_qm_dqflush_done(
}
void
-xfs_dquot_done(
+xfs_buf_dquot_iodone(
struct xfs_buf *bp)
{
struct xfs_log_item *lip, *n;
@@ -1118,6 +1142,18 @@ xfs_dquot_done(
}
}
+void
+xfs_buf_dquot_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ spin_lock(&bp->b_mount->m_ail->ail_lock);
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ xfs_set_li_failed(lip, bp);
+ spin_unlock(&bp->b_mount->m_ail->ail_lock);
+}
+
/* Check incore dquot for errors before we flush. */
static xfs_failaddr_t
xfs_qm_dqflush_check(
@@ -1145,6 +1181,14 @@ xfs_qm_dqflush_check(
!dqp->q_rtb.timer)
return __this_address;
+ /* bigtime flag should never be set on root dquots */
+ if (dqp->q_type & XFS_DQTYPE_BIGTIME) {
+ if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb))
+ return __this_address;
+ if (dqp->q_id == 0)
+ return __this_address;
+ }
+
return NULL;
}
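
The new dqflush check enforces "BIGTIME set implies the superblock feature is enabled and the dquot is not the id-0 default record". The same predicate, sketched in plain C with made-up field names:

#include <stdbool.h>
#include <stdint.h>

#define DQTYPE_BIGTIME  (1u << 7)   /* illustrative bit, not the on-disk value */

struct dquot {
	uint32_t id;
	uint8_t  type;
	bool     sb_has_bigtime;       /* stands in for the superblock feature bit */
};

/* Returns true when the dquot's bigtime state is self-consistent. */
static bool dquot_bigtime_ok(const struct dquot *dq)
{
	if (!(dq->type & DQTYPE_BIGTIME))
		return true;           /* flag clear: nothing to check */
	if (!dq->sb_has_bigtime)
		return false;          /* flag set without the feature */
	if (dq->id == 0)
		return false;          /* default records keep the legacy format */
	return true;
}
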
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 282a65da93c7..f642884a6834 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -237,4 +237,7 @@ typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
xfs_qm_dqiterate_fn iter_fn, void *priv);
+time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
+time64_t xfs_dquot_set_grace_period(time64_t grace);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6cb8cd11072a..6c11bfc3d452 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -585,10 +585,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
STATIC int
xfs_efi_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
struct xfs_extent *extp;
@@ -608,14 +608,8 @@ xfs_efi_item_recover(
if (startblock_fsb == 0 ||
extp->ext_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
- extp->ext_len >= mp->m_sb.sb_agblocks) {
- /*
- * This will pull the EFI from the AIL and
- * free the memory associated with it.
- */
- xfs_efi_release(efip);
+ extp->ext_len >= mp->m_sb.sb_agblocks)
return -EFSCORRUPTED;
- }
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@@ -633,8 +627,7 @@ xfs_efi_item_recover(
}
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_trans_cancel(tp);
@@ -649,6 +642,34 @@ xfs_efi_item_match(
return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_efi_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_extent *extp;
+ unsigned int count;
+
+ count = EFI_ITEM(intent)->efi_format.efi_nextents;
+ extp = EFI_ITEM(intent)->efi_format.efi_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
+ efdp->efd_next_extent = count;
+ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
+ atomic_set(&efip->efi_next_extent, count);
+ xfs_trans_add_item(tp, &efip->efi_item);
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+ return &efip->efi_item;
+}
+
static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
@@ -656,6 +677,7 @@ static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_release = xfs_efi_item_release,
.iop_recover = xfs_efi_item_recover,
.iop_match = xfs_efi_item_match,
+ .iop_relog = xfs_efi_item_relog,
};
/*
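
Relogging an intent means logging a done item that retires the old intent and a fresh intent carrying the same extent payload in the current transaction, so the log tail can move past the original copy. A schematic userspace rendition of the payload copy, using toy structures rather than the real EFI/EFD types:

#include <stdlib.h>
#include <string.h>

struct extent {
	unsigned long start;
	unsigned long len;
};

struct intent {
	unsigned int nextents;
	struct extent extents[];       /* flexible payload, as in the EFI */
};

/* Allocate a fresh intent that carries the same extents as the old one. */
static struct intent *relog_intent(const struct intent *old)
{
	size_t payload = old->nextents * sizeof(struct extent);
	struct intent *new = malloc(sizeof(*new) + payload);

	if (!new)
		return NULL;
	new->nextents = old->nextents;
	memcpy(new->extents, old->extents, payload);
	/* the kernel would also log a done item for 'old' at this point */
	return new;
}
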
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a29f78a663ca..3d1b95124744 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1008,6 +1008,21 @@ xfs_file_fadvise(
return ret;
}
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
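
The O_SYNC/O_DSYNC half of xfs_file_sync_writes() has a direct userspace analogue via fcntl(F_GETFL); the mount-flag and IS_SYNC() checks do not. A small sketch (the test path name is arbitrary):

#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static bool fd_wants_sync_writes(int fd)
{
	int flags = fcntl(fd, F_GETFL);

	if (flags < 0)
		return false;   /* treat lookup failure as "no" for the sketch */
	return (flags & (O_SYNC | O_DSYNC)) != 0;
}

int main(void)
{
	int fd = open("/tmp/sync-test", O_WRONLY | O_CREAT | O_DSYNC, 0600);

	if (fd >= 0) {
		printf("sync writes: %s\n",
		       fd_wants_sync_writes(fd) ? "yes" : "no");
		close(fd);
	}
	return 0;
}
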
STATIC loff_t
xfs_file_remap_range(
struct file *file_in,
@@ -1065,7 +1080,7 @@ xfs_file_remap_range(
if (ret)
goto out_unlock;
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
xfs_log_force_inode(dest);
out_unlock:
xfs_iunlock2_io_mmap(src, dest);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 1a88025e68a3..db23e455eb91 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -33,39 +33,7 @@ enum xfs_fstrm_alloc {
/*
* Allocation group filestream associations are tracked with per-ag atomic
* counters. These counters allow xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation. When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all. The race condition this addresses is the following:
- *
- * - The work queue scheduler fires and pulls a filestream directory cache
- * element off the LRU end of the cache for deletion, then gets pre-empted.
- * - A growfs operation grabs the m_peraglock in write mode, flushes all the
- * remaining items from the cache and reallocates the mount point's per-ag
- * array, resetting all the counters to zero.
- * - The work queue thread resumes and calls the free function for the element
- * it started cleaning up earlier. In the process it decrements the
- * filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
+ * particular AG already has active filestreams associated with it.
*/
int
xfs_filestream_peek_ag(
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 4eebcec4aae6..9ce5e7d5bf8f 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -26,7 +26,7 @@
#include "xfs_rtalloc.h"
/* Convert an xfs_fsmap to an fsmap. */
-void
+static void
xfs_fsmap_from_internal(
struct fsmap *dest,
struct xfs_fsmap *src)
@@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap(
/* getfsmap query state */
struct xfs_getfsmap_info {
struct xfs_fsmap_head *head;
- xfs_fsmap_format_t formatter; /* formatting fn */
- void *format_arg; /* format buffer */
+ struct fsmap *fsmap_recs; /* mapping records */
struct xfs_buf *agf_bp; /* AGF, for refcount queries */
xfs_daddr_t next_daddr; /* next daddr we expect */
u64 missing_owner; /* owner of holes */
@@ -224,6 +223,20 @@ xfs_getfsmap_is_shared(
return 0;
}
+static inline void
+xfs_getfsmap_format(
+ struct xfs_mount *mp,
+ struct xfs_fsmap *xfm,
+ struct xfs_getfsmap_info *info)
+{
+ struct fsmap *rec;
+
+ trace_xfs_getfsmap_mapping(mp, xfm);
+
+ rec = &info->fsmap_recs[info->head->fmh_entries++];
+ xfs_fsmap_from_internal(rec, xfm);
+}
+
/*
* Format a reverse mapping for getfsmap, having translated rm_startblock
* into the appropriate daddr units.
@@ -256,6 +269,9 @@ xfs_getfsmap_helper(
/* Are we just counting mappings? */
if (info->head->fmh_count == 0) {
+ if (info->head->fmh_entries == UINT_MAX)
+ return -ECANCELED;
+
if (rec_daddr > info->next_daddr)
info->head->fmh_entries++;
@@ -285,10 +301,7 @@ xfs_getfsmap_helper(
fmr.fmr_offset = 0;
fmr.fmr_length = rec_daddr - info->next_daddr;
fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
- error = info->formatter(&fmr, info->format_arg);
- if (error)
- return error;
- info->head->fmh_entries++;
+ xfs_getfsmap_format(mp, &fmr, info);
}
if (info->last)
@@ -320,11 +333,8 @@ xfs_getfsmap_helper(
if (shared)
fmr.fmr_flags |= FMR_OF_SHARED;
}
- error = info->formatter(&fmr, info->format_arg);
- if (error)
- return error;
- info->head->fmh_entries++;
+ xfs_getfsmap_format(mp, &fmr, info);
out:
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
@@ -792,11 +802,11 @@ xfs_getfsmap_check_keys(
#endif /* CONFIG_XFS_RT */
/*
- * Get filesystem's extents as described in head, and format for
- * output. Calls formatter to fill the user's buffer until all
- * extents are mapped, until the passed-in head->fmh_count slots have
- * been filled, or until the formatter short-circuits the loop, if it
- * is tracking filled-in extents on its own.
+ * Get filesystem's extents as described in head, and format for output. Fills
+ * in the supplied records array until there are no more reverse mappings to
+ * return or head.fmh_entries == head.fmh_count. In the second case, this
+ * function returns -ECANCELED to indicate that more records would have been
+ * returned.
*
* Key to Confusion
* ----------------
@@ -816,8 +826,7 @@ int
xfs_getfsmap(
struct xfs_mount *mp,
struct xfs_fsmap_head *head,
- xfs_fsmap_format_t formatter,
- void *arg)
+ struct fsmap *fsmap_recs)
{
struct xfs_trans *tp = NULL;
struct xfs_fsmap dkeys[2]; /* per-dev keys */
@@ -892,8 +901,7 @@ xfs_getfsmap(
info.next_daddr = head->fmh_keys[0].fmr_physical +
head->fmh_keys[0].fmr_length;
- info.formatter = formatter;
- info.format_arg = arg;
+ info.fsmap_recs = fsmap_recs;
info.head = head;
/*
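
The fsmap refactor swaps a per-record formatter callback for a caller-supplied array that the query fills, signalling a full array with -ECANCELED so the caller can drain it and re-enter. A compact sketch of that contract, with invented record and sentinel names:

#include <stddef.h>

#define QUERY_FULL  (-1)   /* stands in for -ECANCELED */

struct rec {
	unsigned long key;
	unsigned long len;
};

/*
 * Fill 'out' with up to 'cap' records starting at *cursor, advancing the
 * cursor and reporting how many were written.  Returns QUERY_FULL when
 * the array filled before the result set was exhausted.
 */
static int query(unsigned long *cursor, struct rec *out, size_t cap,
		 size_t *filled, unsigned long end)
{
	size_t n = 0;

	while (*cursor < end) {
		if (n == cap) {
			*filled = n;
			return QUERY_FULL;   /* caller copies out and retries */
		}
		out[n].key = *cursor;
		out[n].len = 1;
		n++;
		(*cursor)++;
	}
	*filled = n;
	return 0;                            /* no more records */
}
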
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
index c6c57739b862..a0775788e7b1 100644
--- a/fs/xfs/xfs_fsmap.h
+++ b/fs/xfs/xfs_fsmap.h
@@ -27,13 +27,9 @@ struct xfs_fsmap_head {
struct xfs_fsmap fmh_keys[2]; /* low and high keys */
};
-void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
-/* fsmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
-
int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
- xfs_fsmap_format_t formatter, void *arg);
+ struct fsmap *out_recs);
#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 101028ebb571..deb99300d171 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -52,7 +52,6 @@ xfs_inode_alloc(
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
/* initialise the xfs inode */
@@ -123,7 +122,7 @@ void
xfs_inode_free(
struct xfs_inode *ip)
{
- ASSERT(!xfs_isiflocked(ip));
+ ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -1035,23 +1034,21 @@ xfs_reclaim_inode(
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
goto out;
- if (!xfs_iflock_nowait(ip))
+ if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
goto out_iunlock;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
- /* xfs_iflush_abort() drops the flush lock */
xfs_iflush_abort(ip);
goto reclaim;
}
if (xfs_ipincount(ip))
- goto out_ifunlock;
+ goto out_clear_flush;
if (!xfs_inode_clean(ip))
- goto out_ifunlock;
+ goto out_clear_flush;
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
- ASSERT(!xfs_isiflocked(ip));
/*
* Because we use RCU freeing we need to ensure the inode always appears
@@ -1101,8 +1098,8 @@ reclaim:
__xfs_inode_free(ip);
return;
-out_ifunlock:
- xfs_ifunlock(ip);
+out_clear_flush:
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
@@ -1211,7 +1208,7 @@ xfs_reclaim_inodes(
while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
xfs_ail_push_all_sync(mp->m_ail);
xfs_reclaim_inodes_ag(mp, &nr_to_scan);
- };
+ }
}
/*
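
Replacing the flush "lock" with the XFS_IFLUSHING flag works because xfs_iflags_test_and_set() is atomic: exactly one caller observes the bit clear and wins the right to flush. The same idiom in portable C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>

#define IFLUSHING (1u << 7)

struct inode {
	atomic_uint flags;
};

/* Returns true if the flag was already set, i.e. we lost the race. */
static bool test_and_set_flushing(struct inode *ip)
{
	unsigned int old = atomic_fetch_or(&ip->flags, IFLUSHING);

	return (old & IFLUSHING) != 0;
}

/* Completion path: drop the flag so the next flusher can win. */
static void clear_flushing(struct inode *ip)
{
	atomic_fetch_and(&ip->flags, ~IFLUSHING);
}
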
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c06129cffba9..2bfbcf28b1bd 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -598,22 +598,6 @@ xfs_lock_two_inodes(
}
}
-void
-__xfs_iflock(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
- do {
- prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (xfs_isiflocked(ip))
- io_schedule();
- } while (!xfs_iflock_nowait(ip));
-
- finish_wait(wq, &wait.wq_entry);
-}
-
STATIC uint
_xfs_dic2xflags(
uint16_t di_flags,
@@ -714,6 +698,68 @@ out_unlock:
return error;
}
+/* Propagate di_flags from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ unsigned int di_flags = 0;
+ umode_t mode = VFS_I(ip)->i_mode;
+
+ if (S_ISDIR(mode)) {
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(mode)) {
+ if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) &&
+ xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ }
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
+ xfs_inherit_noatime)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
+ xfs_inherit_nodump)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
+ xfs_inherit_sync)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
+ xfs_inherit_nosymlinks)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+
+ ip->i_d.di_flags |= di_flags;
+}
+
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags2(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+ }
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
+}
+
/*
* Allocate an inode on disk and return a copy of its in-core version.
* The in-core inode is locked exclusively. Set mode, nlink, and rdev
@@ -840,7 +886,7 @@ xfs_ialloc(
if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
inode_set_iversion(inode, 1);
- ip->i_d.di_flags2 = 0;
+ ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2;
ip->i_d.di_cowextsize = 0;
ip->i_d.di_crtime = tv;
}
@@ -857,54 +903,10 @@ xfs_ialloc(
break;
case S_IFREG:
case S_IFDIR:
- if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
- uint di_flags = 0;
-
- if (S_ISDIR(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_REALTIME;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSIZE;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- }
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
- xfs_inherit_noatime)
- di_flags |= XFS_DIFLAG_NOATIME;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
- xfs_inherit_nodump)
- di_flags |= XFS_DIFLAG_NODUMP;
- if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
- xfs_inherit_sync)
- di_flags |= XFS_DIFLAG_SYNC;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
- xfs_inherit_nosymlinks)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
- xfs_inherit_nodefrag)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
-
- ip->i_d.di_flags |= di_flags;
- }
- if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) {
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
- }
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
- }
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY))
+ xfs_inode_inherit_flags(ip, pip);
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY))
+ xfs_inode_inherit_flags2(ip, pip);
/* FALLTHROUGH */
case S_IFLNK:
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
@@ -1532,17 +1534,10 @@ xfs_itruncate_extents_flags(
if (error)
goto out;
- /*
- * Duplicate the transaction that has the permanent
- * reservation and commit the old transaction.
- */
+ /* free the just unmapped extents */
error = xfs_defer_finish(&tp);
if (error)
goto out;
-
- error = xfs_trans_roll_inode(&tp, ip);
- if (error)
- goto out;
}
if (whichfork == XFS_DATA_FORK) {
@@ -2531,11 +2526,8 @@ retry:
* valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- return;
- }
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
+ goto out_iflags_unlock;
/*
* Don't try to lock/unlock the current inode, but we _cannot_ skip the
@@ -2552,16 +2544,14 @@ retry:
}
}
ip->i_flags |= XFS_ISTALE;
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
/*
- * If we can't get the flush lock, the inode is already attached. All
+ * If the inode is flushing, it is already attached to the buffer. All
* we needed to do here is mark the inode stale so buffer IO completion
* will remove it from the AIL.
*/
iip = ip->i_itemp;
- if (!xfs_iflock_nowait(ip)) {
+ if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
ASSERT(!list_empty(&iip->ili_item.li_bio_list));
ASSERT(iip->ili_last_fields);
goto out_iunlock;
@@ -2573,10 +2563,12 @@ retry:
* commit as the flock synchronises removal of the inode from the
* cluster buffer against inode reclaim.
*/
- if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
- xfs_ifunlock(ip);
+ if (!iip || list_empty(&iip->ili_item.li_bio_list))
goto out_iunlock;
- }
+
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
/* we have a dirty inode in memory that has not yet been flushed. */
spin_lock(&iip->ili_lock);
@@ -2586,9 +2578,16 @@ retry:
spin_unlock(&iip->ili_lock);
ASSERT(iip->ili_last_fields);
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return;
+
out_iunlock:
if (ip != free_ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_iflags_unlock:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
}
/*
@@ -2631,8 +2630,9 @@ xfs_ifree_cluster(
/*
* We obtain and lock the backing buffer first in the process
- * here, as we have to ensure that any dirty inode that we
- * can't get the flush lock on is attached to the buffer.
+ * here to ensure dirty inodes attached to the buffer remain in
+ * the flushing state while we mark them stale.
+ *
* If we scan the in-memory inodes first, then buffer IO can
* complete before we get a lock on it, and hence we may fail
* to mark all the active inodes on the buffer stale.
@@ -2717,7 +2717,7 @@ xfs_ifree(
VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
- ip->i_d.di_flags2 = 0;
+ ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
@@ -3443,7 +3443,7 @@ xfs_iflush(
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
ASSERT(iip->ili_item.li_buf == bp);
@@ -3553,8 +3553,8 @@ xfs_iflush(
*
* What we do is move the bits to the ili_last_fields field. When
* logging the inode, these bits are moved back to the ili_fields field.
- * In the xfs_iflush_done() routine we clear ili_last_fields, since we
- * know that the information those bits represent is permanently on
+ * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
+ * we know that the information those bits represent is permanently on
* disk. As long as the flush completes before the inode is logged
* again, then both ili_fields and ili_last_fields will be cleared.
*/
@@ -3568,7 +3568,7 @@ flush_out:
/*
* Store the current LSN of the inode so that we can tell whether the
- * item has moved in the AIL from xfs_iflush_done().
+ * item has moved in the AIL from xfs_buf_inode_iodone().
*/
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -3613,7 +3613,7 @@ xfs_iflush_cluster(
/*
* Quick and dirty check to avoid locks if possible.
*/
- if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
continue;
if (xfs_ipincount(ip))
continue;
@@ -3627,7 +3627,7 @@ xfs_iflush_cluster(
*/
spin_lock(&ip->i_flags_lock);
ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
- if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
spin_unlock(&ip->i_flags_lock);
continue;
}
@@ -3635,24 +3635,17 @@ xfs_iflush_cluster(
/*
* ILOCK will pin the inode against reclaim and prevent
* concurrent transactions modifying the inode while we are
- * flushing the inode.
+ * flushing the inode. If we get the lock, set the flushing
+ * state before we drop the i_flags_lock.
*/
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
spin_unlock(&ip->i_flags_lock);
continue;
}
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
spin_unlock(&ip->i_flags_lock);
/*
- * Skip inodes that are already flush locked as they have
- * already been written to the buffer.
- */
- if (!xfs_iflock_nowait(ip)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- continue;
- }
-
- /*
* Abort flushing this inode if we are shut down because the
* inode may not currently be in the AIL. This can occur when
* log I/O failure unpins the inode without inserting into the
@@ -3661,7 +3654,6 @@ xfs_iflush_cluster(
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
xfs_iunpin_wait(ip);
- /* xfs_iflush_abort() drops the flush lock */
xfs_iflush_abort(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
error = -EIO;
@@ -3670,7 +3662,7 @@ xfs_iflush_cluster(
/* don't block waiting on a log force to unpin dirty inodes */
if (xfs_ipincount(ip)) {
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
continue;
}
@@ -3678,7 +3670,7 @@ xfs_iflush_cluster(
if (!xfs_inode_clean(ip))
error = xfs_iflush(ip, bp);
else
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (error)
break;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e9a8bb184d1f..751a3d1d7d84 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -194,6 +194,11 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
return ip->i_cowfp && ip->i_cowfp->if_bytes;
}
+static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
+{
+ return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -211,8 +216,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
#define XFS_INEW (1 << __XFS_INEW_BIT)
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
-#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
-#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */
@@ -234,36 +238,6 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
/*
- * Synchronize processes attempting to flush the in-core inode back to disk.
- */
-
-static inline int xfs_isiflocked(struct xfs_inode *ip)
-{
- return xfs_iflags_test(ip, XFS_IFLOCK);
-}
-
-extern void __xfs_iflock(struct xfs_inode *ip);
-
-static inline int xfs_iflock_nowait(struct xfs_inode *ip)
-{
- return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
-}
-
-static inline void xfs_iflock(struct xfs_inode *ip)
-{
- if (!xfs_iflock_nowait(ip))
- __xfs_iflock(ip);
-}
-
-static inline void xfs_ifunlock(struct xfs_inode *ip)
-{
- ASSERT(xfs_isiflocked(ip));
- xfs_iflags_clear(ip, XFS_IFLOCK);
- smp_mb();
- wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
-}
-
-/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6c65938cee1c..17e20a6d8b4e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -295,6 +295,28 @@ xfs_inode_item_format_attr_fork(
}
}
+/*
+ * Convert an incore timestamp to a log timestamp. Note that the log
+ * format stores timestamps in host endian order!
+ */
+static inline xfs_ictimestamp_t
+xfs_inode_to_log_dinode_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_ictimestamp *lits;
+ xfs_ictimestamp_t its;
+
+ if (xfs_inode_has_bigtime(ip))
+ return xfs_inode_encode_bigtime(tv);
+
+ lits = (struct xfs_legacy_ictimestamp *)&its;
+ lits->t_sec = tv.tv_sec;
+ lits->t_nsec = tv.tv_nsec;
+
+ return its;
+}
+
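
On pre-bigtime inodes the 64-bit log timestamp slot really holds two packed 32-bit fields in host endianness, which is what the legacy cast above expresses. A userspace illustration of the packing, using memcpy instead of the kernel's pointer cast and invented type names:

#include <stdint.h>
#include <string.h>

typedef uint64_t log_ts_t;

struct legacy_ts {
	int32_t t_sec;
	int32_t t_nsec;
};

/* Pack sec/nsec into the 64-bit slot the way the legacy log format does. */
static log_ts_t pack_legacy_ts(int64_t sec, long nsec)
{
	struct legacy_ts lts = {
		.t_sec  = (int32_t)sec,
		.t_nsec = (int32_t)nsec,
	};
	log_ts_t ts;

	memcpy(&ts, &lts, sizeof(ts));   /* portable stand-in for the cast */
	return ts;
}
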
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
@@ -313,12 +335,9 @@ xfs_inode_to_log_dinode(
memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime.t_sec = inode->i_atime.tv_sec;
- to->di_atime.t_nsec = inode->i_atime.tv_nsec;
- to->di_mtime.t_sec = inode->i_mtime.tv_sec;
- to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
- to->di_ctime.t_sec = inode->i_ctime.tv_sec;
- to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
@@ -340,8 +359,7 @@ xfs_inode_to_log_dinode(
if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
- to->di_crtime.t_sec = from->di_crtime.tv_sec;
- to->di_crtime.t_nsec = from->di_crtime.tv_nsec;
+ to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime);
to->di_flags2 = from->di_flags2;
to->di_cowextsize = from->di_cowextsize;
to->di_ino = ip->i_ino;
@@ -491,8 +509,7 @@ xfs_inode_item_push(
(ip->i_flags & XFS_ISTALE))
return XFS_ITEM_PINNED;
- /* If the inode is already flush locked, we're already flushing. */
- if (xfs_isiflocked(ip))
+ if (xfs_iflags_test(ip, XFS_IFLUSHING))
return XFS_ITEM_FLUSHING;
if (!xfs_buf_trylock(bp))
@@ -703,7 +720,7 @@ xfs_iflush_finish(
iip->ili_last_fields = 0;
iip->ili_flush_lsn = 0;
spin_unlock(&iip->ili_lock);
- xfs_ifunlock(iip->ili_inode);
+ xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
if (drop_buffer)
xfs_buf_rele(bp);
}
@@ -711,11 +728,11 @@ xfs_iflush_finish(
/*
* Inode buffer IO completion routine. It is responsible for removing inodes
- * attached to the buffer from the AIL if they have not been re-logged, as well
- * as completing the flush and unlocking the inode.
+ * attached to the buffer from the AIL if they have not been re-logged and
+ * completing the inode flush.
*/
void
-xfs_iflush_done(
+xfs_buf_inode_iodone(
struct xfs_buf *bp)
{
struct xfs_log_item *lip, *n;
@@ -754,11 +771,21 @@ xfs_iflush_done(
list_splice_tail(&flushed_inodes, &bp->b_li_list);
}
+void
+xfs_buf_inode_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ set_bit(XFS_LI_FAILED, &lip->li_flags);
+}
+
/*
- * This is the inode flushing abort routine. It is called from xfs_iflush when
+ * This is the inode flushing abort routine. It is called when
* the filesystem is shutting down to clean up the inode state. It is
* responsible for removing the inode item from the AIL if it has not been
- * re-logged, and unlocking the inode's flush lock.
+ * re-logged and clearing the inode's flush state.
*/
void
xfs_iflush_abort(
@@ -790,7 +817,7 @@ xfs_iflush_abort(
list_del_init(&iip->ili_item.li_bio_list);
spin_unlock(&iip->ili_lock);
}
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
if (bp)
xfs_buf_rele(bp);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 048b5e7dee90..4b926e32831c 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -25,8 +25,8 @@ struct xfs_inode_log_item {
*
* We need atomic changes between inode dirtying, inode flushing and
* inode completion, but these all hold different combinations of
- * ILOCK and iflock and hence we need some other method of serialising
- * updates to the flush state.
+ * ILOCK and IFLUSHING and hence we need some other method of
+ * serialising updates to the flush state.
*/
spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
@@ -43,7 +43,6 @@ static inline int xfs_inode_clean(struct xfs_inode *ip)
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *);
extern void xfs_iflush_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 5e0d291835b3..cb44f7653f03 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -115,6 +115,82 @@ out_free_ip:
return error;
}
+static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
+}
+
+/* Convert a log timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_log_dinode_to_disk_ts(
+ struct xfs_log_dinode *from,
+ const xfs_ictimestamp_t its)
+{
+ struct xfs_legacy_timestamp *lts;
+ struct xfs_legacy_ictimestamp *lits;
+ xfs_timestamp_t ts;
+
+ if (xfs_log_dinode_has_bigtime(from))
+ return cpu_to_be64(its);
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lits = (struct xfs_legacy_ictimestamp *)&its;
+ lts->t_sec = cpu_to_be32(lits->t_sec);
+ lts->t_nsec = cpu_to_be32(lits->t_nsec);
+
+ return ts;
+}
+
+STATIC void
+xfs_log_dinode_to_disk(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ to->di_magic = cpu_to_be16(from->di_magic);
+ to->di_mode = cpu_to_be16(from->di_mode);
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = 0;
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_nlink = cpu_to_be32(from->di_nlink);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+
+ to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
+ to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
+ to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);
+
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+ to->di_gen = cpu_to_be32(from->di_gen);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(from->di_changecount);
+ to->di_crtime = xfs_log_dinode_to_disk_ts(from,
+ from->di_crtime);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
+ to->di_ino = cpu_to_be64(from->di_ino);
+ to->di_lsn = cpu_to_be64(from->di_lsn);
+ memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ }
+}
+
STATIC int
xlog_recover_inode_commit_pass2(
struct xlog *log,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6f22a66777cd..3fbd98f61ea5 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -404,7 +404,7 @@ xfs_ioc_attr_list(
context.cursor.offset))
return -EINVAL;
- buffer = kmem_zalloc_large(bufsize, 0);
+ buffer = kvzalloc(bufsize, GFP_KERNEL);
if (!buffer)
return -ENOMEM;
@@ -1190,7 +1190,8 @@ xfs_flags2diflags2(
unsigned int xflags)
{
uint64_t di_flags2 =
- (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+ (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK |
+ XFS_DIFLAG2_BIGTIME));
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
@@ -1690,7 +1691,7 @@ xfs_ioc_getbmap(
if (bmx.bmv_count > ULONG_MAX / recsize)
return -ENOMEM;
- buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0);
+ buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1715,39 +1716,17 @@ out_free_buf:
return error;
}
-struct getfsmap_info {
- struct xfs_mount *mp;
- struct fsmap_head __user *data;
- unsigned int idx;
- __u32 last_flags;
-};
-
-STATIC int
-xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
-{
- struct getfsmap_info *info = priv;
- struct fsmap fm;
-
- trace_xfs_getfsmap_mapping(info->mp, xfm);
-
- info->last_flags = xfm->fmr_flags;
- xfs_fsmap_from_internal(&fm, xfm);
- if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm,
- sizeof(struct fsmap)))
- return -EFAULT;
-
- return 0;
-}
-
STATIC int
xfs_ioc_getfsmap(
struct xfs_inode *ip,
struct fsmap_head __user *arg)
{
- struct getfsmap_info info = { NULL };
struct xfs_fsmap_head xhead = {0};
struct fsmap_head head;
- bool aborted = false;
+ struct fsmap *recs;
+ unsigned int count;
+ __u32 last_flags = 0;
+ bool done = false;
int error;
if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
@@ -1759,38 +1738,112 @@ xfs_ioc_getfsmap(
sizeof(head.fmh_keys[1].fmr_reserved)))
return -EINVAL;
+ /*
+ * Use an internal memory buffer so that we don't have to copy fsmap
+ * data to userspace while holding locks. Start by trying to allocate
+ * up to 128k for the buffer, but fall back to a single page if needed.
+ */
+ count = min_t(unsigned int, head.fmh_count,
+ 131072 / sizeof(struct fsmap));
+ recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs) {
+ count = min_t(unsigned int, head.fmh_count,
+ PAGE_SIZE / sizeof(struct fsmap));
+ recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs)
+ return -ENOMEM;
+ }
+
xhead.fmh_iflags = head.fmh_iflags;
- xhead.fmh_count = head.fmh_count;
xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
- info.mp = ip->i_mount;
- info.data = arg;
- error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
- if (error == -ECANCELED) {
- error = 0;
- aborted = true;
- } else if (error)
- return error;
+ head.fmh_entries = 0;
+ do {
+ struct fsmap __user *user_recs;
+ struct fsmap *last_rec;
+
+ user_recs = &arg->fmh_recs[head.fmh_entries];
+ xhead.fmh_entries = 0;
+ xhead.fmh_count = min_t(unsigned int, count,
+ head.fmh_count - head.fmh_entries);
+
+ /* Run query, record how many entries we got. */
+ error = xfs_getfsmap(ip->i_mount, &xhead, recs);
+ switch (error) {
+ case 0:
+ /*
+ * There are no more records in the result set. Copy
+ * whatever we got to userspace and break out.
+ */
+ done = true;
+ break;
+ case -ECANCELED:
+ /*
+ * The internal memory buffer is full. Copy whatever
+ * records we got to userspace and go again if we have
+ * not yet filled the userspace buffer.
+ */
+ error = 0;
+ break;
+ default:
+ goto out_free;
+ }
+ head.fmh_entries += xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
- /* If we didn't abort, set the "last" flag in the last fmx */
- if (!aborted && info.idx) {
- info.last_flags |= FMR_OF_LAST;
- if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
- &info.last_flags, sizeof(info.last_flags)))
- return -EFAULT;
+ /*
+ * If the caller wanted a record count or there aren't any
+ * new records to return, we're done.
+ */
+ if (head.fmh_count == 0 || xhead.fmh_entries == 0)
+ break;
+
+ /* Copy all the records we got out to userspace. */
+ if (copy_to_user(user_recs, recs,
+ xhead.fmh_entries * sizeof(struct fsmap))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+ /* Remember the last record flags we copied to userspace. */
+ last_rec = &recs[xhead.fmh_entries - 1];
+ last_flags = last_rec->fmr_flags;
+
+ /* Set up the low key for the next iteration. */
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ } while (!done && head.fmh_entries < head.fmh_count);
+
+ /*
+ * If there are no more records in the query result set and we're not
+ * in counting mode, mark the last record returned with the LAST flag.
+ */
+ if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
+ struct fsmap __user *user_rec;
+
+ last_flags |= FMR_OF_LAST;
+ user_rec = &arg->fmh_recs[head.fmh_entries - 1];
+
+ if (copy_to_user(&user_rec->fmr_flags, &last_flags,
+ sizeof(last_flags))) {
+ error = -EFAULT;
+ goto out_free;
+ }
}
/* copy back header */
- head.fmh_entries = xhead.fmh_entries;
- head.fmh_oflags = xhead.fmh_oflags;
- if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
- return -EFAULT;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
- return 0;
+out_free:
+ kmem_free(recs);
+ return error;
}
STATIC int
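
Two patterns carry the getfsmap loop above: allocate a large bounce buffer but degrade to a single page under memory pressure, then drain the result set chunk by chunk into the user buffer. A condensed userspace rendition of the allocation-fallback half (the sizes mirror the 128k/4k choices in the comment; everything else is invented):

#include <stddef.h>
#include <stdlib.h>

struct rec {
	unsigned long key;
	unsigned long len;
};

/*
 * Try for a ~128k buffer first and settle for one 4k page if that fails.
 * Returns the record capacity through *cap, or NULL if even the small
 * allocation failed.
 */
static struct rec *alloc_bounce(size_t want, size_t *cap)
{
	size_t n = want;
	struct rec *recs;

	if (n > 131072 / sizeof(struct rec))
		n = 131072 / sizeof(struct rec);
	recs = calloc(n, sizeof(struct rec));
	if (!recs) {
		n = 4096 / sizeof(struct rec);
		recs = calloc(n, sizeof(struct rec));
		if (!recs)
			return NULL;
	}
	*cap = n;
	return recs;
}
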
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 80a13c8561d8..5e165456da68 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -237,7 +237,7 @@ xfs_vn_create(
umode_t mode,
bool flags)
{
- return xfs_vn_mknod(dir, dentry, mode, 0);
+ return xfs_generic_create(dir, dentry, mode, 0, false);
}
STATIC int
@@ -246,7 +246,7 @@ xfs_vn_mkdir(
struct dentry *dentry,
umode_t mode)
{
- return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
+ return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
}
STATIC struct dentry *
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ab737fed7b12..ad1009778d33 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -123,7 +123,6 @@ typedef __u32 xfs_nlink_t;
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
-#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ad0c69ee8947..fa2d05e65ff1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1475,14 +1475,14 @@ xlog_commit_record(
}
/*
- * Push on the buffer cache code if we ever use more than 75% of the on-disk
- * log space. This code pushes on the lsn which would supposedly free up
- * the 25% which we want to leave free. We may need to adopt a policy which
- * pushes on an lsn which is further along in the log once we reach the high
- * water mark. In this manner, we would be creating a low water mark.
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * (a) enough on-disk log space to log the number of bytes specified, (b) at
+ * least 25% of the log space free, and (c) at least 256 blocks free. If the
+ * log free space already meets all three thresholds, this function returns
+ * NULLCOMMITLSN.
*/
-STATIC void
-xlog_grant_push_ail(
+xfs_lsn_t
+xlog_grant_push_threshold(
struct xlog *log,
int need_bytes)
{
@@ -1508,7 +1508,7 @@ xlog_grant_push_ail(
free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
free_threshold = max(free_threshold, 256);
if (free_blocks >= free_threshold)
- return;
+ return NULLCOMMITLSN;
xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
&threshold_block);
@@ -1528,13 +1528,33 @@ xlog_grant_push_ail(
if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
threshold_lsn = last_sync_lsn;
+ return threshold_lsn;
+}
+
+/*
+ * Push the tail of the log if we need to do so to maintain the free log space
+ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
+ * policy which pushes on an lsn which is further along in the log once we
+ * reach the high water mark. In this manner, we would be creating a low water
+ * mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn;
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
+ return;
+
/*
* Get the transaction layer to kick the dirty buffers out to
* disk asynchronously. No point in trying to do this if
* the filesystem is shutting down.
*/
- if (!XLOG_FORCED_SHUTDOWN(log))
- xfs_ail_push(log->l_ailp, threshold_lsn);
+ xfs_ail_push(log->l_ailp, threshold_lsn);
}
/*
@@ -1604,9 +1624,7 @@ xlog_cksum(
int i;
int xheads;
- xheads = size / XLOG_HEADER_CYCLE_SIZE;
- if (size % XLOG_HEADER_CYCLE_SIZE)
- xheads++;
+ xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
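
DIV_ROUND_UP(n, d) is ((n) + (d) - 1) / (d), replacing the divide-then-maybe-increment idiom removed above, and xlog_grant_push_threshold() keeps the larger of the blocks needed, a quarter of the log, and a 256-block floor. Both in a tiny standalone sketch (block-unit conversions omitted):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

/* Blocks of free log space we insist on before skipping a tail push. */
static int free_threshold(int need_blocks, int log_blocks)
{
	int thresh = need_blocks;

	if (thresh < log_blocks / 4)
		thresh = log_blocks / 4;   /* keep >= 25% of the log free */
	if (thresh < 256)
		thresh = 256;              /* absolute floor */
	return thresh;
}

int main(void)
{
	printf("%d\n", DIV_ROUND_UP(33000, 32768));   /* 2 header blocks */
	printf("%d\n", free_threshold(100, 8192));    /* 2048 */
	return 0;
}
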
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 1412d6993f1e..58c3fcbec94a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -141,4 +141,6 @@ void xfs_log_quiesce(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
bool xfs_log_in_recovery(struct xfs_mount *);
+xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
+
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e2ec91b2d0f4..a8289adc1b29 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -265,32 +265,6 @@ xlog_header_check_mount(
return 0;
}
-void
-xlog_recover_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- /*
- * We're not going to bother about retrying
- * this during recovery. One strike!
- */
- if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
- xfs_buf_ioerror_alert(bp, __this_address);
- xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
- }
- }
-
- /*
- * On v5 supers, a bli could be attached to update the metadata LSN.
- * Clean it up.
- */
- if (bp->b_log_item)
- xfs_buf_item_relse(bp);
- ASSERT(bp->b_log_item == NULL);
- bp->b_flags &= ~_XBF_LOGRECOVERY;
- xfs_buf_ioend_finish(bp);
-}
-
/*
* This routine finds (to an approximation) the first block in the physical
* log which contains the given cycle. It uses a binary search algorithm.
@@ -397,6 +371,19 @@ out:
return error;
}
+static inline int
+xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
+{
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ int h_size = be32_to_cpu(rh->h_size);
+
+ if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
+ h_size > XLOG_HEADER_CYCLE_SIZE)
+ return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
+ }
+ return 1;
+}
+
/*
* Potentially backup over partial log record write.
*
@@ -489,15 +476,7 @@ xlog_find_verify_log_record(
* reset last_blk. Only when last_blk points in the middle of a log
* record do we update last_blk.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- uint h_size = be32_to_cpu(head->h_size);
-
- xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- xhdrs++;
- } else {
- xhdrs = 1;
- }
+ xhdrs = xlog_logrec_hblks(log, head);
if (*last_blk - i + extra_bblks !=
BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
@@ -1184,22 +1163,7 @@ xlog_check_unmount_rec(
* below. We won't want to clear the unmount record if there is one, so
* we pass the lsn of the unmount record rather than the block after it.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- int h_size = be32_to_cpu(rhead->h_size);
- int h_version = be32_to_cpu(rhead->h_version);
-
- if ((h_version & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
- } else {
- hblks = 1;
- }
- } else {
- hblks = 1;
- }
-
+ hblks = xlog_logrec_hblks(log, rhead);
after_umount_blk = xlog_wrap_logbno(log,
rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
@@ -2097,7 +2061,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len + old_len, 0);
+ ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -2470,44 +2434,66 @@ xlog_recover_process_data(
/* Take all the collected deferred ops and finish them in order. */
static int
xlog_finish_defer_ops(
- struct xfs_trans *parent_tp)
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
{
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
- int64_t freeblks;
- uint resblks;
- int error;
+ struct xfs_inode *ip;
+ int error = 0;
- /*
- * We're finishing the defer_ops that accumulated as a result of
- * recovering unfinished intent items during log recovery. We
- * reserve an itruncate transaction because it is the largest
- * permanent transaction type. Since we're the only user of the fs
- * right now, take 93% (15/16) of the available free blocks. Use
- * weird math to avoid a 64-bit division.
- */
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
- if (freeblks <= 0)
- return -ENOSPC;
- resblks = min_t(int64_t, UINT_MAX, freeblks);
- resblks = (resblks * 15) >> 4;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
- /* transfer all collected dfops to this transaction */
- xfs_defer_move(tp, parent_tp);
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ struct xfs_trans_res resv;
+
+ /*
+ * Create a new transaction reservation from the captured
+ * information. Set logcount to 1 to force the new transaction
+ * to regrant every roll so that we can make forward progress
+ * in recovery no matter how full the log might be.
+ */
+ resv.tr_logres = dfc->dfc_logres;
+ resv.tr_logcount = 1;
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ /*
+ * Transfer to this new transaction all the dfops we captured
+ * from recovering a single intent item.
+ */
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_continue(dfc, tp, &ip);
+
+ error = xfs_trans_commit(tp);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ }
+ if (error)
+ return error;
+ }
- return xfs_trans_commit(tp);
+ ASSERT(list_empty(capture_list));
+ return 0;
}
-/* Is this log item a deferred action intent? */
-static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
+/* Release all the captured defer ops and capture structures in this list. */
+static void
+xlog_abort_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
{
- return lip->li_ops->iop_recover != NULL &&
- lip->li_ops->iop_match != NULL;
-}
+ struct xfs_defer_capture *dfc;
+ struct xfs_defer_capture *next;
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_release(mp, dfc);
+ }
+}
/*
* When this is called, all of the log intent items which did not have
* corresponding log done items should be in the AIL. What we do now
@@ -2528,35 +2514,23 @@ STATIC int
xlog_recover_process_intents(
struct xlog *log)
{
- struct xfs_trans *parent_tp;
+ LIST_HEAD(capture_list);
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
struct xfs_ail *ailp;
- int error;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
- /*
- * The intent recovery handlers commit transactions to complete recovery
- * for individual intents, but any new deferred operations that are
- * queued during that process are held off until the very end. The
- * purpose of this transaction is to serve as a container for deferred
- * operations. Each intent recovery handler must transfer dfops here
- * before its local transaction commits, and we'll finish the entire
- * list below.
- */
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
- if (error)
- return error;
-
ailp = log->l_ailp;
spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- while (lip != NULL) {
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
/*
* We're done when we see something other than an intent.
* There should be no intents left in the AIL now.
@@ -2578,26 +2552,29 @@ xlog_recover_process_intents(
/*
* NOTE: If your intent processing routine can create more
- * deferred ops, you /must/ attach them to the transaction in
- * this routine or else those subsequent intents will get
+ * deferred ops, you /must/ attach them to the capture list in
+ * the recover routine or else those subsequent intents will be
* replayed in the wrong order!
*/
- if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) {
- spin_unlock(&ailp->ail_lock);
- error = lip->li_ops->iop_recover(lip, parent_tp);
- spin_lock(&ailp->ail_lock);
- }
+ spin_unlock(&ailp->ail_lock);
+ error = lip->li_ops->iop_recover(lip, &capture_list);
+ spin_lock(&ailp->ail_lock);
if (error)
- goto out;
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ break;
}
-out:
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
- if (!error)
- error = xlog_finish_defer_ops(parent_tp);
- xfs_trans_cancel(parent_tp);
+ if (error)
+ goto err;
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
return error;
}
@@ -2904,7 +2881,8 @@ STATIC int
xlog_valid_rec_header(
struct xlog *log,
struct xlog_rec_header *rhead,
- xfs_daddr_t blkno)
+ xfs_daddr_t blkno,
+ int bufsize)
{
int hlen;
@@ -2920,10 +2898,14 @@ xlog_valid_rec_header(
return -EFSCORRUPTED;
}
- /* LR body must have data or it wouldn't have been written */
+ /*
+ * LR body must have data (or it wouldn't have been written)
+ * and h_len must not be greater than LR buffer size.
+ */
hlen = be32_to_cpu(rhead->h_len);
- if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX))
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
return -EFSCORRUPTED;
+
if (XFS_IS_CORRUPT(log->l_mp,
blkno > log->l_logBBsize || blkno > INT_MAX))
return -EFSCORRUPTED;
@@ -2984,9 +2966,6 @@ xlog_do_recovery_pass(
goto bread_err1;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, tail_blk);
- if (error)
- goto bread_err1;
/*
* xfsprogs has a bug where record length is based on lsunit but
@@ -3001,30 +2980,22 @@ xlog_do_recovery_pass(
*/
h_size = be32_to_cpu(rhead->h_size);
h_len = be32_to_cpu(rhead->h_len);
- if (h_len > h_size) {
- if (h_len <= log->l_mp->m_logbsize &&
- be32_to_cpu(rhead->h_num_logops) == 1) {
- xfs_warn(log->l_mp,
+ if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
+ rhead->h_num_logops == cpu_to_be32(1)) {
+ xfs_warn(log->l_mp,
"invalid iclog size (%d bytes), using lsunit (%d bytes)",
- h_size, log->l_mp->m_logbsize);
- h_size = log->l_mp->m_logbsize;
- } else {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
- log->l_mp);
- error = -EFSCORRUPTED;
- goto bread_err1;
- }
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
}
- if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
+ error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
+ if (error)
+ goto bread_err1;
+
+ hblks = xlog_logrec_hblks(log, rhead);
+ if (hblks != 1) {
kmem_free(hbp);
hbp = xlog_alloc_buffer(log, hblks);
- } else {
- hblks = 1;
}
} else {
ASSERT(log->l_sectBBsize == 1);
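
xlog_logrec_hblks() replaces the open-coded computation deleted above. Reconstructed from those deleted lines: on a v2 log whose iclog header size exceeds one cycle's worth of header data, round the size up to whole XLOG_HEADER_CYCLE_SIZE units; otherwise a single header block. A hedged sketch (the real helper may differ in detail):

static inline int
xlog_logrec_hblks_sketch(
	struct xlog		*log,
	struct xlog_rec_header	*rhead)
{
	int	h_size = be32_to_cpu(rhead->h_size);

	if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
	    h_size > XLOG_HEADER_CYCLE_SIZE)
		return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
	return 1;
}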
@@ -3096,7 +3067,7 @@ xlog_do_recovery_pass(
}
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead,
- split_hblks ? blk_no : 0);
+ split_hblks ? blk_no : 0, h_size);
if (error)
goto bread_err2;
@@ -3177,7 +3148,7 @@ xlog_do_recovery_pass(
goto bread_err2;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, blk_no);
+ error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
if (error)
goto bread_err2;
@@ -3294,14 +3265,14 @@ xlog_do_log_recovery(
*/
STATIC int
xlog_do_recover(
- struct xlog *log,
- xfs_daddr_t head_blk,
- xfs_daddr_t tail_blk)
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
- xfs_buf_t *bp;
- xfs_sb_t *sbp;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp = mp->m_sb_bp;
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
trace_xfs_log_recover(log, head_blk, tail_blk);
@@ -3315,9 +3286,8 @@ xlog_do_recover(
/*
* If IO errors happened during recovery, bail out.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- }
/*
* We now update the tail_lsn since much of the recovery has completed
@@ -3331,16 +3301,12 @@ xlog_do_recover(
xlog_assign_tail_lsn(mp);
/*
- * Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock and reverify it.
+ * Now that we've finished replaying all buffer and inode updates,
+ * re-read the superblock and reverify it.
*/
- bp = xfs_getsb(mp);
- bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
- ASSERT(!(bp->b_flags & XBF_WRITE));
- bp->b_flags |= XBF_READ;
- bp->b_ops = &xfs_sb_buf_ops;
-
- error = xfs_buf_submit(bp);
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ error = _xfs_buf_read(bp, XBF_READ);
if (error) {
if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_buf_ioerror_alert(bp, __this_address);
@@ -3351,7 +3317,6 @@ xlog_do_recover(
}
/* Convert superblock from on-disk format */
- sbp = &mp->m_sb;
xfs_sb_from_disk(sbp, bp->b_addr);
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c8ae49a1e99c..150ee5cb8645 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -80,9 +80,9 @@ xfs_uuid_mount(
}
if (hole < 0) {
- xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+ xfs_uuid_table = krealloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- 0);
+ GFP_KERNEL | __GFP_NOFAIL);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
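
Switching from the kmem_realloc() wrapper to krealloc() makes the no-fail semantics explicit in the GFP flags. The grow-by-one pattern as a minimal sketch, with illustrative names (my_table/my_table_size are not the real identifiers):

static uuid_t	*my_table;
static int	my_table_size;

static void
table_append(
	const uuid_t	*uuid)
{
	/* __GFP_NOFAIL: the allocator loops until it succeeds, so no NULL check */
	my_table = krealloc(my_table,
			(my_table_size + 1) * sizeof(*my_table),
			GFP_KERNEL | __GFP_NOFAIL);
	my_table[my_table_size++] = *uuid;
}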
@@ -1059,11 +1059,12 @@ xfs_unmountfs(
* We can potentially deadlock here if we have an inode cluster
* that has been freed but has its buffer still pinned in memory because
* the transaction is still sitting in an iclog. The stale inodes
- * on that buffer will have their flush locks held until the
- * transaction hits the disk and the callbacks run. the inode
- * flush takes the flush lock unconditionally and with nothing to
- * push out the iclog we will never get that unlocked. hence we
- * need to force the log first.
+ * on that buffer will be pinned to the buffer until the
+ * transaction hits the disk and the callbacks run. Pushing the AIL will
+ * skip the stale inodes and may never see the pinned buffer, so
+ * nothing will push out the iclog and unpin the buffer. Hence we
+ * need to force the log here to ensure all items are flushed into the
+ * AIL before we go any further.
*/
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -1289,23 +1290,6 @@ xfs_mod_frextents(
}
/*
- * xfs_getsb() is called to obtain the buffer for the superblock.
- * The buffer is returned locked and read in from disk.
- * The buffer should be released with a call to xfs_brelse().
- */
-struct xfs_buf *
-xfs_getsb(
- struct xfs_mount *mp)
-{
- struct xfs_buf *bp = mp->m_sb_bp;
-
- xfs_buf_lock(bp);
- xfs_buf_hold(bp);
- ASSERT(bp->b_flags & XBF_DONE);
- return bp;
-}
-
-/*
* Used to free the superblock along various error paths.
*/
void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a72cfcaa4ad1..dfa429b77ee2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -410,7 +410,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
-extern struct xfs_buf *xfs_getsb(xfs_mount_t *);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 5f04d8a5ab2a..0aa87c210104 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -15,6 +15,10 @@
"XFS: offsetof(" #structname ", " #member ") is wrong, " \
"expected " #off)
+#define XFS_CHECK_VALUE(value, expected) \
+ BUILD_BUG_ON_MSG((value) != (expected), \
+ "XFS: value of " #value " is wrong, expected " #expected)
+
static inline void __init
xfs_check_ondisk_structs(void)
{
@@ -23,7 +27,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
@@ -41,7 +45,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
- XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
@@ -84,12 +89,12 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
@@ -121,7 +126,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
@@ -152,6 +158,20 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64);
+
+ /*
+ * Make sure the incore inode timestamp range corresponds to hand
+ * converted values based on the ondisk format specification.
+ */
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET,
+ XFS_LEGACY_TIME_MIN);
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET,
+ 16299260424LL);
+
+ /* Do the same with the incore quota expiration range. */
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
+ 16299260424LL);
}
#endif /* __XFS_ONDISK_H */
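
XFS_CHECK_VALUE() expands to BUILD_BUG_ON_MSG(), so a constant that drifts from the hand-converted ondisk values above fails the build instead of surfacing at mount time. Note what the first pair of checks pins down: bigtime-to-unix conversion must be a plain epoch-offset subtraction. A deliberately broken use (illustrative only) aborts compilation:

/* Illustrative: this line would fail the build, by design. */
XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET,
		XFS_LEGACY_TIME_MIN + 1);
	/* error: XFS: value of ... is wrong, expected ... */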
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index be67570badf8..b2a9abee8b2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -249,7 +249,6 @@ xfs_qm_unmount_quotas(
STATIC int
xfs_qm_dqattach_one(
struct xfs_inode *ip,
- xfs_dqid_t id,
xfs_dqtype_t type,
bool doalloc,
struct xfs_dquot **IO_idqpp)
@@ -330,23 +329,23 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
- error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)),
- XFS_DQTYPE_USER, doalloc, &ip->i_udquot);
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
+ doalloc, &ip->i_udquot);
if (error)
goto done;
ASSERT(ip->i_udquot);
}
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
- error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)),
- XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot);
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP,
+ doalloc, &ip->i_gdquot);
if (error)
goto done;
ASSERT(ip->i_gdquot);
}
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ,
doalloc, &ip->i_pdquot);
if (error)
goto done;
@@ -661,6 +660,17 @@ xfs_qm_init_quotainfo(
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
+ if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ qinf->qi_expiry_min =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN);
+ qinf->qi_expiry_max =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX);
+ } else {
+ qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN;
+ qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX;
+ }
+ trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min,
+ qinf->qi_expiry_max);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
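
With qi_expiry_min/qi_expiry_max initialized per feature bit, a grace-period expiry written through xfs_dquot_set_timeout() (see the setqlim changes below) can simply be clamped into the representable ondisk range. A hedged sketch of that clamp; the real helper lives in the dquot code and may differ:

static inline time64_t
clamp_expiry_sketch(
	struct xfs_quotainfo	*qi,
	time64_t		timeout)
{
	return clamp_t(time64_t, timeout, qi->qi_expiry_min,
			qi->qi_expiry_max);
}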
@@ -879,6 +889,8 @@ xfs_qm_reset_dqcounts(
ddq->d_bwarns = 0;
ddq->d_iwarns = 0;
ddq->d_rtbwarns = 0;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
}
if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -1650,6 +1662,7 @@ xfs_qm_vop_dqalloc(
}
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
+ ASSERT(O_udqpp);
if (!uid_eq(inode->i_uid, uid)) {
/*
* What we need is the dquot that has this uid, and
@@ -1683,6 +1696,7 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
+ ASSERT(O_gdqpp);
if (!gid_eq(inode->i_gid, gid)) {
xfs_iunlock(ip, lockflags);
error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
@@ -1700,9 +1714,10 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+ ASSERT(O_pdqpp);
if (ip->i_d.di_projid != prid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, (xfs_dqid_t)prid,
+ error = xfs_qm_dqget(mp, prid,
XFS_DQTYPE_PROJ, true, &pq);
if (error) {
ASSERT(error != -ENOENT);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9c078c35d924..e3dabab44097 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -65,6 +65,10 @@ struct xfs_quotainfo {
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
struct shrinker qi_shrinker;
+
+ /* Minimum and maximum quota expiration timestamp values. */
+ time64_t qi_expiry_min;
+ time64_t qi_expiry_max;
};
static inline struct radix_tree_root *
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 1c542b4a5220..ca1b57d291dc 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -479,13 +479,19 @@ xfs_setqlim_warns(
static inline void
xfs_setqlim_timer(
+ struct xfs_mount *mp,
struct xfs_dquot_res *res,
struct xfs_quota_limits *qlim,
s64 timer)
{
- res->timer = timer;
- if (qlim)
- qlim->time = timer;
+ if (qlim) {
+ /* Set the length of the default grace period. */
+ res->timer = xfs_dquot_set_grace_period(timer);
+ qlim->time = res->timer;
+ } else {
+ /* Set the grace period expiration on a quota. */
+ res->timer = xfs_dquot_set_timeout(mp, timer);
+ }
}
/*
@@ -574,7 +580,7 @@ xfs_qm_scall_setqlim(
if (newlim->d_fieldmask & QC_SPC_WARNS)
xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
if (newlim->d_fieldmask & QC_SPC_TIMER)
- xfs_setqlim_timer(res, qlim, newlim->d_spc_timer);
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
/* Blocks on the realtime device. */
hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
@@ -590,7 +596,7 @@ xfs_qm_scall_setqlim(
if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
- xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer);
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
/* Inodes */
hard = (newlim->d_fieldmask & QC_INO_HARD) ?
@@ -606,7 +612,7 @@ xfs_qm_scall_setqlim(
if (newlim->d_fieldmask & QC_INO_WARNS)
xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
if (newlim->d_fieldmask & QC_INO_TIMER)
- xfs_setqlim_timer(res, qlim, newlim->d_ino_timer);
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
if (id != 0) {
/*
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 06b22e35fc90..5a62398940d0 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -108,8 +108,6 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
-void xfs_dquot_done(struct xfs_buf *);
-
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -151,12 +149,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
-
-static inline void xfs_dquot_done(struct xfs_buf *bp)
-{
- return;
-}
-
#endif /* CONFIG_XFS_QUOTA */
#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index ca93b6488377..7529eb63ce94 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -424,7 +424,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
STATIC int
xfs_cui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_bmbt_irec irec;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
@@ -432,7 +432,7 @@ xfs_cui_item_recover(
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
@@ -467,14 +467,8 @@ xfs_cui_item_recover(
refc->pe_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
refc->pe_len >= mp->m_sb.sb_agblocks ||
- (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) {
- /*
- * This will pull the CUI from the AIL and
- * free the memory associated with it.
- */
- xfs_cui_release(cuip);
+ (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS))
return -EFSCORRUPTED;
- }
}
/*
@@ -493,12 +487,7 @@ xfs_cui_item_recover(
mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
+
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
@@ -555,13 +544,10 @@ xfs_cui_item_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
xfs_trans_cancel(tp);
return error;
}
@@ -574,6 +560,32 @@ xfs_cui_item_match(
return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_cui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_cud_log_item *cudp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_phys_extent *extp;
+ unsigned int count;
+
+ count = CUI_ITEM(intent)->cui_format.cui_nextents;
+ extp = CUI_ITEM(intent)->cui_format.cui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
+ atomic_set(&cuip->cui_next_extent, count);
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+ return &cuip->cui_item;
+}
+
static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
@@ -581,6 +593,7 @@ static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_release = xfs_cui_item_release,
.iop_recover = xfs_cui_item_recover,
.iop_match = xfs_cui_item_match,
+ .iop_relog = xfs_cui_item_relog,
};
/*
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index dc5b0753cd51..7adc996ca6e3 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -467,14 +467,14 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
STATIC int
xfs_rui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_map_extent *rmap;
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
xfs_fsblock_t startblock_fsb;
enum xfs_rmap_intent_type type;
xfs_exntst_t state;
@@ -511,14 +511,8 @@ xfs_rui_item_recover(
rmap->me_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
rmap->me_len >= mp->m_sb.sb_agblocks ||
- (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the RUI from the AIL and
- * free the memory associated with it.
- */
- xfs_rui_release(ruip);
+ (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS))
return -EFSCORRUPTED;
- }
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
@@ -573,8 +567,7 @@ xfs_rui_item_recover(
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
@@ -590,6 +583,32 @@ xfs_rui_item_match(
return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_rui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_rud_log_item *rudp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = RUI_ITEM(intent)->rui_format.rui_nextents;
+ extp = RUI_ITEM(intent)->rui_format.rui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
+ atomic_set(&ruip->rui_next_extent, count);
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+ return &ruip->rui_item;
+}
+
static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
@@ -597,6 +616,7 @@ static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_release = xfs_rui_item_release,
.iop_recover = xfs_rui_item_recover,
.iop_match = xfs_rui_item_match,
+ .iop_relog = xfs_rui_item_relog,
};
/*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6209e7b6b895..ede1baf31413 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -18,7 +18,7 @@
#include "xfs_trans_space.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
-
+#include "xfs_sb.h"
/*
* Read and return the summary information for a given extent size,
@@ -247,6 +247,9 @@ xfs_rtallocate_extent_block(
end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
i <= end;
i++) {
+ /* Make sure we don't scan off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+
/*
* See if there's a free extent of maxlen starting at i.
* If it's not so then next will contain the first non-free.
@@ -442,6 +445,14 @@ xfs_rtallocate_extent_near(
*/
if (bno >= mp->m_sb.sb_rextents)
bno = mp->m_sb.sb_rextents - 1;
+
+ /* Make sure we don't run off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+ if (maxlen < minlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+
/*
* Try the exact allocation first.
*/
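
Worked example of the clamp above: with sb_rextents = 1000, a request starting at bno = 990 with maxlen = 64 yields min(1000, 990 + 64) - 990 = 10, so the search never runs past the last rt extent, and if that leaves maxlen < minlen the caller returns NULLRTBLOCK instead of scanning off the end. A standalone arithmetic check (userspace, illustrative):

#include <assert.h>

int main(void)
{
	unsigned long long sb_rextents = 1000, bno = 990, maxlen = 64;
	unsigned long long end = bno + maxlen;
	unsigned long long clamped = (sb_rextents < end ? sb_rextents : end) - bno;

	assert(clamped == 10);	/* search stops at the volume end */
	return 0;
}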
@@ -767,8 +778,14 @@ xfs_growfs_rt_alloc(
struct xfs_bmbt_irec map; /* block map output */
int nmap; /* number of block maps */
int resblks; /* space reservation */
+ enum xfs_blft buf_type;
struct xfs_trans *tp;
+ if (ip == mp->m_rsumip)
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ else
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+
/*
* Allocate space to the file, as necessary.
*/
@@ -830,6 +847,9 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0, &bp);
if (error)
goto out_trans_cancel;
+
+ xfs_trans_buf_set_type(tp, bp, buf_type);
+ bp->b_ops = &xfs_rtbuf_ops;
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
@@ -862,7 +882,7 @@ xfs_alloc_rsum_cache(
* lower bound on the minimum level with any free extents. We can
* continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
+ mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
if (!mp->m_rsum_cache)
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -1004,23 +1024,29 @@ xfs_growfs_rt(
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
/*
- * Update the bitmap inode's size.
+ * Update the bitmap inode's size ondisk and incore. We need
+ * to update the incore size so that inode inactivation won't
+ * punch what it thinks are "posteof" blocks.
*/
mp->m_rbmip->i_d.di_size =
nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+ i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_d.di_size);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
/*
* Get the summary inode into the transaction.
*/
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
- * Update the summary inode's size.
+ * Update the summary inode's size. We need to update the
+ * incore size so that inode inactivation won't punch what it
+ * thinks are "posteof" blocks.
*/
mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
+ i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_d.di_size);
xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
/*
* Copy summary data from old to new sizes.
@@ -1076,7 +1102,13 @@ error_cancel:
if (error)
break;
}
+ if (error)
+ goto out_free;
+
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+out_free:
/*
* Free the fake mp structure.
*/
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f70f1255220b..20e0534a772c 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
uint64_t xs_xstrat_bytes = 0;
uint64_t xs_write_bytes = 0;
uint64_t xs_read_bytes = 0;
+ uint64_t defer_relog = 0;
static const struct xstats_entry {
char *desc;
@@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
+ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
}
len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
+ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
+ defer_relog);
len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 34d704f703d2..43ffba74f045 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -137,6 +137,7 @@ struct __xfsstats {
uint64_t xs_xstrat_bytes;
uint64_t xs_write_bytes;
uint64_t xs_read_bytes;
+ uint64_t defer_relog;
};
#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 71ac6c1cdc36..d1b5f2d2a245 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -654,11 +654,11 @@ xfs_fs_destroy_inode(
ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
/*
- * We always use background reclaim here because even if the
- * inode is clean, it still may be under IO and hence we have
- * to take the flush lock. The background reclaim path handles
- * this more efficiently than we can here, so simply let background
- * reclaim tear down all inodes.
+ * We always use background reclaim here because even if the inode is
+ * clean, it still may be under IO and hence we have to wait for IO
+ * completion to occur before we can reclaim the inode. The background
+ * reclaim path handles this more efficiently than we can here, so
+ * simply let background reclaim tear down all inodes.
*/
xfs_inode_set_reclaim_tag(ip);
}
@@ -1234,25 +1234,12 @@ xfs_fc_parse_param(
case Opt_nouuid:
mp->m_flags |= XFS_MOUNT_NOUUID;
return 0;
- case Opt_ikeep:
- mp->m_flags |= XFS_MOUNT_IKEEP;
- return 0;
- case Opt_noikeep:
- mp->m_flags &= ~XFS_MOUNT_IKEEP;
- return 0;
case Opt_largeio:
mp->m_flags |= XFS_MOUNT_LARGEIO;
return 0;
case Opt_nolargeio:
mp->m_flags &= ~XFS_MOUNT_LARGEIO;
return 0;
- case Opt_attr2:
- mp->m_flags |= XFS_MOUNT_ATTR2;
- return 0;
- case Opt_noattr2:
- mp->m_flags &= ~XFS_MOUNT_ATTR2;
- mp->m_flags |= XFS_MOUNT_NOATTR2;
- return 0;
case Opt_filestreams:
mp->m_flags |= XFS_MOUNT_FILESTREAMS;
return 0;
@@ -1304,6 +1291,24 @@ xfs_fc_parse_param(
xfs_mount_set_dax_mode(mp, result.uint_32);
return 0;
#endif
+ /* Following mount options will be removed in September 2025 */
+ case Opt_ikeep:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags |= XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_noikeep:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_attr2:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+ return 0;
+ case Opt_noattr2:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags &= ~XFS_MOUNT_ATTR2;
+ mp->m_flags |= XFS_MOUNT_NOATTR2;
+ return 0;
default:
xfs_warn(mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -1450,6 +1455,19 @@ xfs_fc_fill_super(
if (error)
goto out_free_sb;
+ /* V4 support is undergoing deprecation. */
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+#ifdef CONFIG_XFS_SUPPORT_V4
+ xfs_warn_once(mp,
+ "Deprecated V4 format (crc=0) will not be supported after September 2030.");
+#else
+ xfs_warn(mp,
+ "Deprecated V4 format (crc=0) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+#endif
+ }
+
/*
* XFS block mappings use 54 bits to store the logical block offset.
* This should suffice to handle the maximum file size that the VFS
@@ -1484,8 +1502,14 @@ xfs_fc_fill_super(
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
- sb->s_time_min = S32_MIN;
- sb->s_time_max = S32_MAX;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN);
+ sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX);
+ } else {
+ sb->s_time_min = XFS_LEGACY_TIME_MIN;
+ sb->s_time_max = XFS_LEGACY_TIME_MAX;
+ }
+ trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
sb->s_iflags |= SB_I_CGROUPWB;
set_posix_acl_flag(sb);
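
The bigtime branch widens the VFS-visible timestamp range beyond the legacy 32-bit window. Consistent with the ondisk checks added in xfs_ondisk.h earlier in this series, the conversion is assumed to be a plain epoch-offset subtraction; a hedged sketch (the real xfs_bigtime_to_unix() is defined in the ondisk headers):

static inline time64_t
bigtime_to_unix_sketch(
	uint64_t	ondisk_seconds)
{
	/* ondisk seconds count up from the legacy minimum (S32_MIN) */
	return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
}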
@@ -1494,6 +1518,10 @@ xfs_fc_fill_super(
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
sb->s_flags |= SB_I_VERSION;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ xfs_warn(mp,
+ "EXPERIMENTAL big timestamp feature in use. Use at your own risk!");
+
if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) {
bool rtdev_is_dax = false, datadev_is_dax;
@@ -1549,6 +1577,10 @@ xfs_fc_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
+ xfs_warn(mp,
+ "EXPERIMENTAL inode btree counters feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 021ef96d0542..fac9de7ee6d0 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -50,13 +50,45 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
+STATIC int
+xfs_deprecate_irix_sgid_inherit_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) {
+ printk_once(KERN_WARNING
+ "XFS: " "%s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
+STATIC int
+xfs_deprecate_irix_symlink_mode_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) {
+ printk_once(KERN_WARNING
+ "XFS: " "%s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
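
The two handlers above differ only in name; both print the deprecation warning and fall through to proc_dointvec_minmax(), and ctl->procname already identifies the knob. A possible consolidation, not part of this patch:

STATIC int
xfs_deprecated_dointvec_minmax(
	struct ctl_table	*ctl,
	int			write,
	void			*buffer,
	size_t			*lenp,
	loff_t			*ppos)
{
	if (write)
		printk_once(KERN_WARNING
			"XFS: %s sysctl option is deprecated.\n",
			ctl->procname);
	return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
}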
static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
@@ -65,7 +97,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index abb1d859f226..86951652d3ed 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -338,7 +338,7 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
@@ -2533,6 +2533,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
@@ -3676,7 +3677,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \
DEFINE_KMEM_EVENT(kmem_alloc);
DEFINE_KMEM_EVENT(kmem_alloc_io);
DEFINE_KMEM_EVENT(kmem_alloc_large);
-DEFINE_KMEM_EVENT(kmem_realloc);
TRACE_EVENT(xfs_check_new_dalign,
TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
@@ -3844,6 +3844,32 @@ TRACE_EVENT(xfs_btree_bload_block,
__entry->nr_records)
)
+DECLARE_EVENT_CLASS(xfs_timestamp_range_class,
+ TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max),
+ TP_ARGS(mp, min, max),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(long long, min)
+ __field(long long, max)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->min = min;
+ __entry->max = max;
+ ),
+ TP_printk("dev %d:%d min %lld max %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->min,
+ __entry->max)
+)
+
+#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \
+DEFINE_EVENT(xfs_timestamp_range_class, name, \
+ TP_PROTO(struct xfs_mount *mp, long long min, long long max), \
+ TP_ARGS(mp, min, max))
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ed72867b1a19..c94e71f741b6 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -468,7 +468,7 @@ xfs_trans_apply_sb_deltas(
xfs_buf_t *bp;
int whole = 0;
- bp = xfs_trans_getsb(tp, tp->t_mountp);
+ bp = xfs_trans_getsb(tp);
sbp = bp->b_addr;
/*
@@ -959,7 +959,7 @@ xfs_trans_cancel(
struct xfs_log_item *lip;
list_for_each_entry(lip, &tp->t_items, li_trans)
- ASSERT(!(lip->li_type == XFS_LI_EFD));
+ ASSERT(!xlog_item_is_intent_done(lip));
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b752501818d2..084658946cc8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -55,14 +55,12 @@ struct xfs_log_item {
#define XFS_LI_ABORTED 1
#define XFS_LI_FAILED 2
#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
-#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */
#define XFS_LI_FLAGS \
{ (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
{ (1 << XFS_LI_ABORTED), "ABORTED" }, \
{ (1 << XFS_LI_FAILED), "FAILED" }, \
- { (1 << XFS_LI_DIRTY), "DIRTY" }, \
- { (1 << XFS_LI_RECOVERED), "RECOVERED" }
+ { (1 << XFS_LI_DIRTY), "DIRTY" }
struct xfs_item_ops {
unsigned flags;
@@ -74,10 +72,29 @@ struct xfs_item_ops {
void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
- int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
+ int (*iop_recover)(struct xfs_log_item *lip,
+ struct list_head *capture_list);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
+ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
+ struct xfs_trans *tp);
};
+/* Is this log item a deferred action intent? */
+static inline bool
+xlog_item_is_intent(struct xfs_log_item *lip)
+{
+ return lip->li_ops->iop_recover != NULL &&
+ lip->li_ops->iop_match != NULL;
+}
+
+/* Is this a log intent-done item? */
+static inline bool
+xlog_item_is_intent_done(struct xfs_log_item *lip)
+{
+ return lip->li_ops->iop_unpin == NULL &&
+ lip->li_ops->iop_push == NULL;
+}
+
/*
* Release the log item as soon as committed. This is for items just logging
* intents that never need to be written back in place.
@@ -209,7 +226,7 @@ xfs_trans_read_buf(
flags, bpp, ops);
}
-struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *);
+struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
@@ -243,4 +260,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern kmem_zone_t *xfs_trans_zone;
+static inline struct xfs_log_item *
+xfs_trans_item_relog(
+ struct xfs_log_item *lip,
+ struct xfs_trans *tp)
+{
+ return lip->li_ops->iop_relog(lip, tp);
+}
+
#endif /* __XFS_TRANS_H__ */
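
xfs_trans_item_relog() is the hook the deferred-ops machinery uses to push the log tail forward when a long-running chain of intents would otherwise pin it; each relog also bumps the defer_relog statistic added in xfs_stats. A hedged sketch of a call site — the real logic lives in the defer code:

static void
maybe_relog_intent(
	struct xfs_trans	*tp,
	struct xfs_log_item	**lipp)
{
	struct xfs_log_item	*lip = *lipp;

	if (!lip->li_ops->iop_relog)
		return;
	/* Logs a done item for the old intent, then a fresh intent. */
	*lipp = xfs_trans_item_relog(lip, tp);
}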
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 11cd666cd99a..42d63b830cb9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -166,50 +166,34 @@ xfs_trans_get_buf_map(
}
/*
- * Get and lock the superblock buffer of this file system for the
- * given transaction.
- *
- * We don't need to use incore_match() here, because the superblock
- * buffer is a private buffer which we keep a pointer to in the
- * mount structure.
+ * Get and lock the superblock buffer for the given transaction.
*/
-xfs_buf_t *
+struct xfs_buf *
xfs_trans_getsb(
- xfs_trans_t *tp,
- struct xfs_mount *mp)
+ struct xfs_trans *tp)
{
- xfs_buf_t *bp;
- struct xfs_buf_log_item *bip;
+ struct xfs_buf *bp = tp->t_mountp->m_sb_bp;
/*
- * Default to just trying to lock the superblock buffer
- * if tp is NULL.
+ * Just increment the lock recursion count if the buffer is already
+ * attached to this transaction.
*/
- if (tp == NULL)
- return xfs_getsb(mp);
-
- /*
- * If the superblock buffer already has this transaction
- * pointer in its b_fsprivate2 field, then we know we already
- * have it locked. In this case we just increment the lock
- * recursion count and return the buffer to the caller.
- */
- bp = mp->m_sb_bp;
if (bp->b_transp == tp) {
- bip = bp->b_log_item;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
+
trace_xfs_trans_getsb_recur(bip);
- return bp;
- }
+ } else {
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ _xfs_trans_bjoin(tp, bp, 1);
- bp = xfs_getsb(mp);
- if (bp == NULL)
- return NULL;
+ trace_xfs_trans_getsb(bp->b_log_item);
+ }
- _xfs_trans_bjoin(tp, bp, 1);
- trace_xfs_trans_getsb(bp->b_log_item);
return bp;
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c6ba7ef18e06..fe45b0c3970c 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -55,6 +55,12 @@ xfs_trans_log_dquot(
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ /* Upgrade the dquot to bigtime format if possible. */
+ if (dqp->q_id != 0 &&
+ xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) &&
+ !(dqp->q_type & XFS_DQTYPE_BIGTIME))
+ dqp->q_type |= XFS_DQTYPE_BIGTIME;
+
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
}
@@ -215,36 +221,27 @@ xfs_trans_mod_dquot(
}
switch (field) {
-
- /*
- * regular disk blk reservation
- */
- case XFS_TRANS_DQ_RES_BLKS:
+ /* regular disk blk reservation */
+ case XFS_TRANS_DQ_RES_BLKS:
qtrx->qt_blk_res += delta;
break;
- /*
- * inode reservation
- */
- case XFS_TRANS_DQ_RES_INOS:
+ /* inode reservation */
+ case XFS_TRANS_DQ_RES_INOS:
qtrx->qt_ino_res += delta;
break;
- /*
- * disk blocks used.
- */
- case XFS_TRANS_DQ_BCOUNT:
+ /* disk blocks used. */
+ case XFS_TRANS_DQ_BCOUNT:
qtrx->qt_bcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELBCOUNT:
+ case XFS_TRANS_DQ_DELBCOUNT:
qtrx->qt_delbcnt_delta += delta;
break;
- /*
- * Inode Count
- */
- case XFS_TRANS_DQ_ICOUNT:
+ /* Inode Count */
+ case XFS_TRANS_DQ_ICOUNT:
if (qtrx->qt_ino_res && delta > 0) {
qtrx->qt_ino_res_used += delta;
ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
@@ -252,17 +249,13 @@ xfs_trans_mod_dquot(
qtrx->qt_icount_delta += delta;
break;
- /*
- * rtblk reservation
- */
- case XFS_TRANS_DQ_RES_RTBLKS:
+ /* rtblk reservation */
+ case XFS_TRANS_DQ_RES_RTBLKS:
qtrx->qt_rtblk_res += delta;
break;
- /*
- * rtblk count
- */
- case XFS_TRANS_DQ_RTBCOUNT:
+ /* rtblk count */
+ case XFS_TRANS_DQ_RTBCOUNT:
if (qtrx->qt_rtblk_res && delta > 0) {
qtrx->qt_rtblk_res_used += delta;
ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
@@ -270,11 +263,11 @@ xfs_trans_mod_dquot(
qtrx->qt_rtbcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELRTBCOUNT:
+ case XFS_TRANS_DQ_DELRTBCOUNT:
qtrx->qt_delrtb_delta += delta;
break;
- default:
+ default:
ASSERT(0);
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8ec7c8f109d7..64cc2a9c38c8 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -24,6 +24,39 @@
#include "zonefs.h"
+static inline int zonefs_zone_mgmt(struct inode *inode,
+ enum req_opf op)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret;
+
+ lockdep_assert_held(&zi->i_truncate_mutex);
+
+ ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
+ zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
+ if (ret) {
+ zonefs_err(inode->i_sb,
+ "Zone management operation %s at %llu failed %d\n",
+ blk_op_str(op), zi->i_zsector, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ i_size_write(inode, isize);
+ /*
+ * A full zone is no longer open/active and does not need
+ * explicit closing.
+ */
+ if (isize >= zi->i_max_size)
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+}
+
static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
@@ -302,6 +335,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
}
/*
+ * If the filesystem is mounted with the explicit-open mount option, we
+ * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
+ * the read-only or offline condition, to avoid attempting an explicit
+ * close of the zone when the inode file is closed.
+ */
+ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
+ (zone->cond == BLK_ZONE_COND_OFFLINE ||
+ zone->cond == BLK_ZONE_COND_READONLY))
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+
+ /*
* If error=remount-ro was specified, any error results in remounting
* the volume as read-only.
*/
@@ -315,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* invalid data.
*/
zonefs_update_stats(inode, data_size);
- i_size_write(inode, data_size);
+ zonefs_i_size_write(inode, data_size);
zi->i_wpoffset = data_size;
return 0;
@@ -328,7 +372,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* eventually correct the file size and zonefs inode write pointer offset
* (which can be out of sync with the drive due to partial write failures).
*/
-static void zonefs_io_error(struct inode *inode, bool write)
+static void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -342,8 +386,6 @@ static void zonefs_io_error(struct inode *inode, bool write)
};
int ret;
- mutex_lock(&zi->i_truncate_mutex);
-
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
* reclaim which may in turn cause a recursion into zonefs as well as
@@ -359,7 +401,14 @@ static void zonefs_io_error(struct inode *inode, bool write)
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
memalloc_noio_restore(noio_flag);
+}
+static void zonefs_io_error(struct inode *inode, bool write)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ mutex_lock(&zi->i_truncate_mutex);
+ __zonefs_io_error(inode, write);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -397,13 +446,27 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
if (isize == old_isize)
goto unlock;
- ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
- zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
- if (ret) {
- zonefs_err(inode->i_sb,
- "Zone management operation at %llu failed %d",
- zi->i_zsector, ret);
+ ret = zonefs_zone_mgmt(inode, op);
+ if (ret)
goto unlock;
+
+ /*
+ * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
+ * take care of open zones.
+ */
+ if (zi->i_flags & ZONEFS_ZONE_OPEN) {
+ /*
+ * Truncating a zone to EMPTY or FULL is the equivalent of
+ * closing the zone. For a truncation to 0, we need to
+ * re-open the zone to ensure new writes can be processed.
+ * For a truncation to the maximum file size, the zone is
+ * closed and writes cannot be accepted anymore, so clear
+ * the open flag.
+ */
+ if (!isize)
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ else
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
}
zonefs_update_stats(inode, isize);
@@ -584,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
mutex_lock(&zi->i_truncate_mutex);
if (i_size_read(inode) < iocb->ki_pos + size) {
zonefs_update_stats(inode, iocb->ki_pos + size);
- i_size_write(inode, iocb->ki_pos + size);
+ zonefs_i_size_write(inode, iocb->ki_pos + size);
}
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -865,8 +928,128 @@ inode_unlock:
return ret;
}
+static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+
+ if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
+ return false;
+
+ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+ return false;
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return false;
+
+ return true;
+}
+
+static int zonefs_open_zone(struct inode *inode)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ int ret = 0;
+
+ mutex_lock(&zi->i_truncate_mutex);
+
+ zi->i_wr_refcnt++;
+ if (zi->i_wr_refcnt == 1) {
+
+ if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
+ atomic_dec(&sbi->s_open_zones);
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ if (i_size_read(inode) < zi->i_max_size) {
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ if (ret) {
+ zi->i_wr_refcnt--;
+ atomic_dec(&sbi->s_open_zones);
+ goto unlock;
+ }
+ zi->i_flags |= ZONEFS_ZONE_OPEN;
+ }
+ }
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ return ret;
+}
+
+static int zonefs_file_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+
+ if (zonefs_file_use_exp_open(inode, file))
+ return zonefs_open_zone(inode);
+
+ return 0;
+}
+
+static void zonefs_close_zone(struct inode *inode)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret = 0;
+
+ mutex_lock(&zi->i_truncate_mutex);
+ zi->i_wr_refcnt--;
+ if (!zi->i_wr_refcnt) {
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * If the file zone is full, it is not open anymore and we only
+ * need to decrement the open count.
+ */
+ if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
+ goto dec;
+
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ if (ret) {
+ __zonefs_io_error(inode, false);
+ /*
+ * Leaving zones explicitly open may lead to a state
+ * where most zones cannot be written (zone resources
+ * exhausted). So take preventive action by remounting
+ * read-only.
+ */
+ if (zi->i_flags & ZONEFS_ZONE_OPEN &&
+ !(sb->s_flags & SB_RDONLY)) {
+ zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ }
+ }
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+dec:
+ atomic_dec(&sbi->s_open_zones);
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+}
+
+static int zonefs_file_release(struct inode *inode, struct file *file)
+{
+ /*
+ * If we explicitly open a zone we must close it again as well, but the
+ * zone management operation can fail (either due to an IO error or as
+ * the zone has gone offline or read-only). Make sure we don't fail the
+ * close(2) for user-space.
+ */
+ if (zonefs_file_use_exp_open(inode, file))
+ zonefs_close_zone(inode);
+
+ return 0;
+}
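+
+The open/close pair above implements a first-writer-opens, last-writer-closes discipline, with s_open_zones enforcing the device's max-open limit via increment-then-check with rollback. A standalone analogue of that accounting (userspace, illustrative):

#include <stdatomic.h>
#include <errno.h>

static atomic_int open_zones;

static int try_open_zone(int max_open_zones)
{
	if (atomic_fetch_add(&open_zones, 1) + 1 > max_open_zones) {
		atomic_fetch_sub(&open_zones, 1);	/* roll back */
		return -EBUSY;	/* zone resources exhausted */
	}
	return 0;
}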
+
static const struct file_operations zonefs_file_operations = {
- .open = generic_file_open,
+ .open = zonefs_file_open,
+ .release = zonefs_file_release,
.fsync = zonefs_file_fsync,
.mmap = zonefs_file_mmap,
.llseek = zonefs_file_llseek,
@@ -890,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
init_rwsem(&zi->i_mmap_sem);
+ zi->i_wr_refcnt = 0;
return &zi->i_vnode;
}
@@ -940,7 +1124,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
enum {
Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
- Opt_err,
+ Opt_explicit_open, Opt_err,
};
static const match_table_t tokens = {
@@ -948,6 +1132,7 @@ static const match_table_t tokens = {
{ Opt_errors_zro, "errors=zone-ro"},
{ Opt_errors_zol, "errors=zone-offline"},
{ Opt_errors_repair, "errors=repair"},
+ { Opt_explicit_open, "explicit-open" },
{ Opt_err, NULL}
};
@@ -984,6 +1169,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options)
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
break;
+ case Opt_explicit_open:
+ sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ break;
default:
return -EINVAL;
}
@@ -1403,6 +1591,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_gid = GLOBAL_ROOT_GID;
sbi->s_perm = 0640;
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+ sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
+ atomic_set(&sbi->s_open_zones, 0);
+ if (!sbi->s_max_open_zones &&
+ sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+ zonefs_info(sb, "No open zones limit. Ignoring explicit-open mount option\n");
+ sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ }
ret = zonefs_read_super(sb);
if (ret)
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 55b39970acb2..51141907097c 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
return ZONEFS_ZTYPE_SEQ;
}
+#define ZONEFS_ZONE_OPEN (1 << 0)
+
/*
* In-memory inode data.
*/
@@ -74,6 +76,10 @@ struct zonefs_inode_info {
*/
struct mutex i_truncate_mutex;
struct rw_semaphore i_mmap_sem;
+
+ /* guarded by i_truncate_mutex */
+ unsigned int i_wr_refcnt;
+ unsigned int i_flags;
};
static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
@@ -154,6 +160,7 @@ enum zonefs_features {
#define ZONEFS_MNTOPT_ERRORS_MASK \
(ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \
ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR)
+#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */
/*
* In-memory Super block information.
@@ -175,6 +182,9 @@ struct zonefs_sb_info {
loff_t s_blocks;
loff_t s_used_blocks;
+
+ unsigned int s_max_open_zones;
+ atomic_t s_open_zones;
};
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)