aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/Makefile5
-rw-r--r--fs/afs/cell.c1
-rw-r--r--fs/afs/internal.h9
-rw-r--r--fs/afs/mntpt.c149
-rw-r--r--fs/afs/super.c432
-rw-r--r--fs/afs/volume.c4
-rw-r--r--fs/aio.c94
-rw-r--r--fs/autofs/autofs_i.h3
-rw-r--r--fs/autofs/inode.c19
-rw-r--r--fs/binfmt_aout.c83
-rw-r--r--fs/binfmt_elf.c32
-rw-r--r--fs/binfmt_script.c57
-rw-r--r--fs/block_dev.c28
-rw-r--r--fs/btrfs/acl.c9
-rw-r--r--fs/btrfs/async-thread.c10
-rw-r--r--fs/btrfs/backref.c22
-rw-r--r--fs/btrfs/compression.c256
-rw-r--r--fs/btrfs/compression.h52
-rw-r--r--fs/btrfs/ctree.c74
-rw-r--r--fs/btrfs/ctree.h97
-rw-r--r--fs/btrfs/delayed-ref.c15
-rw-r--r--fs/btrfs/delayed-ref.h11
-rw-r--r--fs/btrfs/dev-replace.c9
-rw-r--r--fs/btrfs/disk-io.c44
-rw-r--r--fs/btrfs/extent-tree.c364
-rw-r--r--fs/btrfs/extent_io.c103
-rw-r--r--fs/btrfs/extent_io.h15
-rw-r--r--fs/btrfs/extent_map.c5
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/inode.c213
-rw-r--r--fs/btrfs/ioctl.c81
-rw-r--r--fs/btrfs/locking.c108
-rw-r--r--fs/btrfs/locking.h15
-rw-r--r--fs/btrfs/lzo.c31
-rw-r--r--fs/btrfs/qgroup.c385
-rw-r--r--fs/btrfs/qgroup.h120
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/btrfs/ref-verify.c4
-rw-r--r--fs/btrfs/relocation.c119
-rw-r--r--fs/btrfs/root-tree.c12
-rw-r--r--fs/btrfs/scrub.c49
-rw-r--r--fs/btrfs/super.c13
-rw-r--r--fs/btrfs/transaction.c9
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c282
-rw-r--r--fs/btrfs/volumes.c202
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/zlib.c45
-rw-r--r--fs/btrfs/zstd.c318
-rw-r--r--fs/buffer.c12
-rw-r--r--fs/ceph/caps.c72
-rw-r--r--fs/ceph/debugfs.c27
-rw-r--r--fs/ceph/dir.c455
-rw-r--r--fs/ceph/file.c13
-rw-r--r--fs/ceph/inode.c52
-rw-r--r--fs/ceph/mds_client.c698
-rw-r--r--fs/ceph/mds_client.h43
-rw-r--r--fs/ceph/snap.c162
-rw-r--r--fs/ceph/super.c21
-rw-r--r--fs/ceph/super.h43
-rw-r--r--fs/ceph/xattr.c20
-rw-r--r--fs/cifs/Kconfig120
-rw-r--r--fs/cifs/cifs_dfs_ref.c4
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifsfs.c3
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h65
-rw-r--r--fs/cifs/cifsproto.h8
-rw-r--r--fs/cifs/cifssmb.c54
-rw-r--r--fs/cifs/connect.c66
-rw-r--r--fs/cifs/file.c269
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/link.c14
-rw-r--r--fs/cifs/smb1ops.c8
-rw-r--r--fs/cifs/smb2misc.c24
-rw-r--r--fs/cifs/smb2ops.c115
-rw-r--r--fs/cifs/smb2pdu.c179
-rw-r--r--fs/cifs/smb2pdu.h4
-rw-r--r--fs/cifs/smb2transport.c25
-rw-r--r--fs/cifs/smbdirect.c6
-rw-r--r--fs/cifs/trace.h89
-rw-r--r--fs/cifs/transport.c208
-rw-r--r--fs/crypto/Kconfig6
-rw-r--r--fs/crypto/bio.c3
-rw-r--r--fs/crypto/fscrypt_private.h1
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/keyinfo.c4
-rw-r--r--fs/crypto/policy.c3
-rw-r--r--fs/debugfs/inode.c4
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/eventpoll.c173
-rw-r--r--fs/exec.c15
-rw-r--r--fs/exofs/BUGS3
-rw-r--r--fs/exofs/Kbuild20
-rw-r--r--fs/exofs/Kconfig13
-rw-r--r--fs/exofs/Kconfig.ore14
-rw-r--r--fs/exofs/common.h262
-rw-r--r--fs/exofs/dir.c661
-rw-r--r--fs/exofs/exofs.h240
-rw-r--r--fs/exofs/file.c83
-rw-r--r--fs/exofs/inode.c1514
-rw-r--r--fs/exofs/namei.c323
-rw-r--r--fs/exofs/ore.c1178
-rw-r--r--fs/exofs/ore_raid.c756
-rw-r--r--fs/exofs/ore_raid.h62
-rw-r--r--fs/exofs/super.c1071
-rw-r--r--fs/exofs/sys.c205
-rw-r--r--fs/ext2/dir.c35
-rw-r--r--fs/ext2/ext2.h17
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c30
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c44
-rw-r--r--fs/ext2/symlink.c2
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext4/Kconfig17
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h21
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c33
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/indirect.c6
-rw-r--r--fs/ext4/inode.c45
-rw-r--r--fs/ext4/ioctl.c105
-rw-r--r--fs/ext4/mballoc.c7
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c18
-rw-r--r--fs/ext4/page-io.c16
-rw-r--r--fs/ext4/readpage.c8
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/ext4/super.c7
-rw-r--r--fs/ext4/sysfs.c17
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/data.c13
-rw-r--r--fs/f2fs/debug.c20
-rw-r--r--fs/f2fs/dir.c10
-rw-r--r--fs/f2fs/f2fs.h18
-rw-r--r--fs/f2fs/file.c10
-rw-r--r--fs/f2fs/inode.c4
-rw-r--r--fs/f2fs/namei.c6
-rw-r--r--fs/f2fs/super.c13
-rw-r--r--fs/f2fs/sysfs.c4
-rw-r--r--fs/fat/file.c1
-rw-r--r--fs/file.c16
-rw-r--r--fs/file_table.c9
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/fs_context.c642
-rw-r--r--fs/fs_parser.c447
-rw-r--r--fs/fs_types.c105
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c7
-rw-r--r--fs/fuse/dev.c115
-rw-r--r--fs/fuse/dir.c54
-rw-r--r--fs/fuse/file.c342
-rw-r--r--fs/fuse/fuse_i.h28
-rw-r--r--fs/fuse/inode.c28
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c72
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/lops.c6
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/meta_io.c3
-rw-r--r--fs/hpfs/hpfs.h8
-rw-r--r--fs/hugetlbfs/inode.c372
-rw-r--r--fs/inode.c8
-rw-r--r--fs/internal.h13
-rw-r--r--fs/io_uring.c2971
-rw-r--r--fs/iomap.c53
-rw-r--r--fs/jbd2/checkpoint.c17
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c90
-rw-r--r--fs/jbd2/transaction.c83
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/kernfs/file.c31
-rw-r--r--fs/kernfs/inode.c2
-rw-r--r--fs/kernfs/kernfs-internal.h3
-rw-r--r--fs/kernfs/mount.c134
-rw-r--r--fs/lockd/clnt4xdr.c14
-rw-r--r--fs/lockd/clntxdr.c14
-rw-r--r--fs/locks.c32
-rw-r--r--fs/mount.h5
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c9
-rw-r--r--fs/namespace.c401
-rw-r--r--fs/nfs/callback_xdr.c64
-rw-r--r--fs/nfs/delegation.c22
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c98
-rw-r--r--fs/nfs/direct.c7
-rw-r--r--fs/nfs/file.c44
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c225
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h75
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c161
-rw-r--r--fs/nfs/inode.c33
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/io.c12
-rw-r--r--fs/nfs/namespace.c8
-rw-r--r--fs/nfs/nfs2xdr.c124
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3xdr.c209
-rw-r--r--fs/nfs/nfs42.h3
-rw-r--r--fs/nfs/nfs42proc.c164
-rw-r--r--fs/nfs/nfs42xdr.c130
-rw-r--r--fs/nfs/nfs4client.c33
-rw-r--r--fs/nfs/nfs4idmap.c31
-rw-r--r--fs/nfs/nfs4namespace.c5
-rw-r--r--fs/nfs/nfs4proc.c138
-rw-r--r--fs/nfs/nfs4session.c7
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfs/nfs4trace.h25
-rw-r--r--fs/nfs/nfs4xdr.c530
-rw-r--r--fs/nfs/nfstrace.c1
-rw-r--r--fs/nfs/nfstrace.h85
-rw-r--r--fs/nfs/pagelist.c47
-rw-r--r--fs/nfs/pnfs.c33
-rw-r--r--fs/nfs/pnfs.h2
-rw-r--r--fs/nfs/pnfs_dev.c13
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/unlink.c8
-rw-r--r--fs/nfs/write.c19
-rw-r--r--fs/nfsd/nfs3proc.c18
-rw-r--r--fs/nfsd/nfs3xdr.c5
-rw-r--r--fs/nfsd/nfs4callback.c17
-rw-r--r--fs/nfsd/nfs4state.c8
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/notify/fanotify/Kconfig1
-rw-r--r--fs/notify/fanotify/fanotify.c267
-rw-r--r--fs/notify/fanotify/fanotify.h116
-rw-r--r--fs/notify/fanotify/fanotify_user.c373
-rw-r--r--fs/notify/fsnotify.c15
-rw-r--r--fs/notify/inotify/inotify.h1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c18
-rw-r--r--fs/notify/inotify/inotify_user.c5
-rw-r--r--fs/notify/mark.c42
-rw-r--r--fs/notify/notification.c42
-rw-r--r--fs/ocfs2/alloc.c159
-rw-r--r--fs/ocfs2/cluster/nodemanager.c14
-rw-r--r--fs/ocfs2/dlmglue.c5
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/slot_map.c8
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/orangefs/inode.c7
-rw-r--r--fs/overlayfs/copy_up.c59
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/util.c55
-rw-r--r--fs/pipe.c35
-rw-r--r--fs/pnode.c5
-rw-r--r--fs/pnode.h3
-rw-r--r--fs/proc/array.c16
-rw-r--r--fs/proc/base.c138
-rw-r--r--fs/proc/inode.c52
-rw-r--r--fs/proc/internal.h8
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/root.c238
-rw-r--r--fs/proc/self.c16
-rw-r--r--fs/proc/stat.c89
-rw-r--r--fs/proc/task_mmu.c10
-rw-r--r--fs/proc/task_nommu.c4
-rw-r--r--fs/proc/thread_self.c16
-rw-r--r--fs/pstore/platform.c3
-rw-r--r--fs/pstore/ram.c64
-rw-r--r--fs/read_write.c16
-rw-r--r--fs/select.c4
-rw-r--r--fs/splice.c22
-rw-r--r--fs/stat.c12
-rw-r--r--fs/statfs.c14
-rw-r--r--fs/super.c344
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/sysfs/mount.c73
-rw-r--r--fs/timerfd.c4
-rw-r--r--fs/ubifs/Kconfig12
-rw-r--r--fs/ubifs/Makefile2
-rw-r--r--fs/ubifs/ioctl.c4
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/ubifs.h5
-rw-r--r--fs/udf/super.c51
-rw-r--r--fs/utimes.c10
-rw-r--r--fs/xfs/libxfs/xfs_ag.c6
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c74
-rw-r--r--fs/xfs/libxfs/xfs_attr.c17
-rw-r--r--fs/xfs/libxfs/xfs_attr.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c302
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h16
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c13
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c49
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c17
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h1
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c10
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c100
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c10
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c29
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c11
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c7
-rw-r--r--fs/xfs/libxfs/xfs_shared.h4
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c3
-rw-r--r--fs/xfs/libxfs/xfs_types.c24
-rw-r--r--fs/xfs/libxfs/xfs_types.h3
-rw-r--r--fs/xfs/scrub/agheader.c10
-rw-r--r--fs/xfs/scrub/agheader_repair.c12
-rw-r--r--fs/xfs/scrub/attr.c11
-rw-r--r--fs/xfs/scrub/bmap.c27
-rw-r--r--fs/xfs/scrub/dir.c6
-rw-r--r--fs/xfs/scrub/ialloc.c330
-rw-r--r--fs/xfs/scrub/repair.c3
-rw-r--r--fs/xfs/scrub/repair.h3
-rw-r--r--fs/xfs/scrub/rtbitmap.c5
-rw-r--r--fs/xfs/scrub/trace.h45
-rw-r--r--fs/xfs/xfs_aops.c275
-rw-r--r--fs/xfs/xfs_aops.h24
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_bmap_util.c9
-rw-r--r--fs/xfs/xfs_buf.c72
-rw-r--r--fs/xfs/xfs_buf.h8
-rw-r--r--fs/xfs/xfs_error.c6
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_file.c32
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_globals.c2
-rw-r--r--fs/xfs/xfs_inode.c769
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_iomap.c518
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c21
-rw-r--r--fs/xfs/xfs_log_recover.c14
-rw-r--r--fs/xfs/xfs_mount.c5
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_ondisk.h21
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_reflink.c150
-rw-r--r--fs/xfs/xfs_reflink.h18
-rw-r--r--fs/xfs/xfs_super.c22
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c24
-rw-r--r--fs/xfs/xfs_trace.h115
-rw-r--r--fs/xfs/xfs_trans_bmap.c1
-rw-r--r--fs/xfs/xfs_trans_buf.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_refcount.c1
-rw-r--r--fs/xfs/xfs_trans_rmap.c1
-rw-r--r--fs/xfs/xfs_xattr.c3
371 files changed, 15693 insertions, 12891 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce50484e80..10d3bd3f534b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
return ret;
if (v9ses->cache)
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
diff --git a/fs/Kconfig b/fs/Kconfig
index ac474a61be37..3e6d3101f3ff 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -8,6 +8,13 @@ menu "File systems"
config DCACHE_WORD_ACCESS
bool
+config VALIDATE_FS_PARSER
+ bool "Validate filesystem parameter description"
+ default y
+ help
+ Enable this to perform validation of the parameter description for a
+ filesystem when it is registered.
+
if BLOCK
config FS_IOMAP
@@ -254,12 +261,9 @@ source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
-source "fs/exofs/Kconfig"
endif # MISC_FILESYSTEMS
-source "fs/exofs/Kconfig.ore"
-
menuconfig NETWORK_FILESYSTEMS
bool "Network File Systems"
default y
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..427fec226fae 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
- stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+ fs_types.o fs_context.o fs_parser.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -30,6 +31,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
@@ -124,7 +126,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
-obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index cf445dbd5f2e..9de46116c749 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -173,6 +173,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = TIME64_MAX;
+ __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags);
} else {
cell->dns_expiry = ktime_get_real_seconds();
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8871b9e8645f..bb1f244b2b3a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -36,15 +36,14 @@
struct pagevec;
struct afs_call;
-struct afs_mount_params {
- bool rwpath; /* T if the parent should be considered R/W */
+struct afs_fs_context {
bool force; /* T to force cell type */
bool autocell; /* T if set auto mount operation */
bool dyn_root; /* T if dynamic root */
+ bool no_cell; /* T if the source is "none" (for dynroot) */
afs_voltype_t type; /* type of volume requested */
- int volnamesz; /* size of volume name */
+ unsigned int volnamesz; /* size of volume name */
const char *volname; /* name of volume to mount */
- struct net *net_ns; /* Network namespace in effect */
struct afs_net *net; /* the AFS net namespace stuff */
struct afs_cell *cell; /* cell in which to find volume */
struct afs_volume *volume; /* volume record */
@@ -1274,7 +1273,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume)
return volume;
}
-extern struct afs_volume *afs_create_volume(struct afs_mount_params *);
+extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern void afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2e51c6994148..eecd8b699186 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -17,6 +17,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/gfp.h>
+#include <linux/fs_context.h>
#include "internal.h"
@@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
+static const char afs_root_volume[] = "root.cell";
+
/*
* no valid lookup procedure on this sort of dir
*/
@@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
}
/*
- * create a vfsmount to be automounted
+ * Set the parameters for the proposed superblock.
*/
-static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
{
- struct afs_super_info *as;
- struct vfsmount *mnt;
- struct afs_vnode *vnode;
- struct page *page;
- char *devname, *options;
- bool rwpath = false;
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb);
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt));
+ struct afs_cell *cell;
+ const char *p;
int ret;
- _enter("{%pd}", mntpt);
-
- BUG_ON(!d_inode(mntpt));
-
- ret = -ENOMEM;
- devname = (char *) get_zeroed_page(GFP_KERNEL);
- if (!devname)
- goto error_no_devname;
-
- options = (char *) get_zeroed_page(GFP_KERNEL);
- if (!options)
- goto error_no_options;
+ if (fc->net_ns != src_as->net_ns) {
+ put_net(fc->net_ns);
+ fc->net_ns = get_net(src_as->net_ns);
+ }
- vnode = AFS_FS_I(d_inode(mntpt));
+ if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
+ }
+ if (ctx->cell) {
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = NULL;
+ }
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
/* if the directory is a pseudo directory, use the d_name */
- static const char afs_root_cell[] = ":root.cell.";
unsigned size = mntpt->d_name.len;
- ret = -ENOENT;
- if (size < 2 || size > AFS_MAXCELLNAME)
- goto error_no_page;
+ if (size < 2)
+ return -ENOENT;
+ p = mntpt->d_name.name;
if (mntpt->d_name.name[0] == '.') {
- devname[0] = '%';
- memcpy(devname + 1, mntpt->d_name.name + 1, size - 1);
- memcpy(devname + size, afs_root_cell,
- sizeof(afs_root_cell));
- rwpath = true;
- } else {
- devname[0] = '#';
- memcpy(devname + 1, mntpt->d_name.name, size);
- memcpy(devname + size + 1, afs_root_cell,
- sizeof(afs_root_cell));
+ size--;
+ p++;
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
+ if (size > AFS_MAXCELLNAME)
+ return -ENAMETOOLONG;
+
+ cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+ if (IS_ERR(cell)) {
+ pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
+ return PTR_ERR(cell);
+ }
+ ctx->cell = cell;
+
+ ctx->volname = afs_root_volume;
+ ctx->volnamesz = sizeof(afs_root_volume) - 1;
} else {
/* read the contents of the AFS special symlink */
+ struct page *page;
loff_t size = i_size_read(d_inode(mntpt));
char *buf;
- ret = -EINVAL;
+ if (src_as->cell)
+ ctx->cell = afs_get_cell(src_as->cell);
+
if (size > PAGE_SIZE - 1)
- goto error_no_page;
+ return -EINVAL;
page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- goto error_no_page;
- }
+ if (IS_ERR(page))
+ return PTR_ERR(page);
if (PageError(page)) {
ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
- goto error;
+ put_page(page);
+ return ret;
}
- buf = kmap_atomic(page);
- memcpy(devname, buf, size);
- kunmap_atomic(buf);
+ buf = kmap(page);
+ ret = vfs_parse_fs_string(fc, "source", buf, size);
+ kunmap(page);
put_page(page);
- page = NULL;
+ if (ret < 0)
+ return ret;
}
- /* work out what options we want */
- as = AFS_FS_S(mntpt->d_sb);
- if (as->cell) {
- memcpy(options, "cell=", 5);
- strcpy(options + 5, as->cell->name);
- if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath)
- strcat(options, ",rwpath");
- }
+ return 0;
+}
- /* try and do the mount */
- _debug("--- attempting mount %s -o %s ---", devname, options);
- mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
- _debug("--- mount result %p ---", mnt);
+/*
+ * create a vfsmount to be automounted
+ */
+static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+ int ret;
- free_page((unsigned long) devname);
- free_page((unsigned long) options);
- _leave(" = %p", mnt);
- return mnt;
+ BUG_ON(!d_inode(mntpt));
-error:
- put_page(page);
-error_no_page:
- free_page((unsigned long) options);
-error_no_options:
- free_page((unsigned long) devname);
-error_no_devname:
- _leave(" = %d", ret);
- return ERR_PTR(ret);
+ fc = fs_context_for_submount(&afs_fs_type, mntpt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ret = afs_mntpt_set_params(fc, mntpt);
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+
+ put_fs_context(fc);
+ return mnt;
}
/*
diff --git a/fs/afs/super.c b/fs/afs/super.c
index dcd07fe99871..5adf012b8e27 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,6 +1,6 @@
/* AFS superblock handling
*
- * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved.
*
* This software may be freely redistributed under the terms of the
* GNU General Public License.
@@ -21,7 +21,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/statfs.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
@@ -30,21 +30,22 @@
#include "internal.h"
static void afs_i_init_once(void *foo);
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data);
static void afs_kill_super(struct super_block *sb);
static struct inode *afs_alloc_inode(struct super_block *sb);
static void afs_destroy_inode(struct inode *inode);
static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int afs_show_devname(struct seq_file *m, struct dentry *root);
static int afs_show_options(struct seq_file *m, struct dentry *root);
+static int afs_init_fs_context(struct fs_context *fc);
+static const struct fs_parameter_description afs_fs_parameters;
struct file_system_type afs_fs_type = {
- .owner = THIS_MODULE,
- .name = "afs",
- .mount = afs_mount,
- .kill_sb = afs_kill_super,
- .fs_flags = 0,
+ .owner = THIS_MODULE,
+ .name = "afs",
+ .init_fs_context = afs_init_fs_context,
+ .parameters = &afs_fs_parameters,
+ .kill_sb = afs_kill_super,
+ .fs_flags = 0,
};
MODULE_ALIAS_FS("afs");
@@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = {
static struct kmem_cache *afs_inode_cachep;
static atomic_t afs_count_active_inodes;
-enum {
- afs_no_opt,
- afs_opt_cell,
- afs_opt_dyn,
- afs_opt_rwpath,
- afs_opt_vol,
- afs_opt_autocell,
+enum afs_param {
+ Opt_autocell,
+ Opt_dyn,
+ Opt_source,
};
-static const match_table_t afs_options_list = {
- { afs_opt_cell, "cell=%s" },
- { afs_opt_dyn, "dyn" },
- { afs_opt_rwpath, "rwpath" },
- { afs_opt_vol, "vol=%s" },
- { afs_opt_autocell, "autocell" },
- { afs_no_opt, NULL },
+static const struct fs_parameter_spec afs_param_specs[] = {
+ fsparam_flag ("autocell", Opt_autocell),
+ fsparam_flag ("dyn", Opt_dyn),
+ fsparam_string("source", Opt_source),
+ {}
+};
+
+static const struct fs_parameter_description afs_fs_parameters = {
+ .name = "kAFS",
+ .specs = afs_param_specs,
};
/*
@@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
}
/*
- * parse the mount options
- * - this function has been shamelessly adapted from the ext3 fs which
- * shamelessly adapted it from the msdos fs
- */
-static int afs_parse_options(struct afs_mount_params *params,
- char *options, const char **devname)
-{
- struct afs_cell *cell;
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int token;
-
- _enter("%s", options);
-
- options[PAGE_SIZE - 1] = 0;
-
- while ((p = strsep(&options, ","))) {
- if (!*p)
- continue;
-
- token = match_token(p, afs_options_list, args);
- switch (token) {
- case afs_opt_cell:
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(params->net,
- args[0].from,
- args[0].to - args[0].from);
- rcu_read_unlock();
- if (IS_ERR(cell))
- return PTR_ERR(cell);
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
- break;
-
- case afs_opt_rwpath:
- params->rwpath = true;
- break;
-
- case afs_opt_vol:
- *devname = args[0].from;
- break;
-
- case afs_opt_autocell:
- params->autocell = true;
- break;
-
- case afs_opt_dyn:
- params->dyn_root = true;
- break;
-
- default:
- printk(KERN_ERR "kAFS:"
- " Unknown or invalid mount option: '%s'\n", p);
- return -EINVAL;
- }
- }
-
- _leave(" = 0");
- return 0;
-}
-
-/*
- * parse a device name to get cell name, volume name, volume type and R/W
- * selector
- * - this can be one of the following:
+ * Parse the source name to get cell name, volume name, volume type and R/W
+ * selector.
+ *
+ * This can be one of the following:
* "%[cell:]volume[.]" R/W volume
- * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0),
- * or R/W (rwpath=1) volume
+ * "#[cell:]volume[.]" R/O or R/W volume (R/O parent),
+ * or R/W (R/W parent) volume
* "%[cell:]volume.readonly" R/O volume
* "#[cell:]volume.readonly" R/O volume
* "%[cell:]volume.backup" Backup volume
* "#[cell:]volume.backup" Backup volume
*/
-static int afs_parse_device_name(struct afs_mount_params *params,
- const char *name)
+static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_cell *cell;
- const char *cellname, *suffix;
+ const char *cellname, *suffix, *name = param->string;
int cellnamesz;
_enter(",%s", name);
@@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params,
}
if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+ /* To use dynroot, we don't want to have to provide a source */
+ if (strcmp(name, "none") == 0) {
+ ctx->no_cell = true;
+ return 0;
+ }
printk(KERN_ERR "kAFS: unparsable volume name\n");
return -EINVAL;
}
/* determine the type of volume we're looking for */
- params->type = AFSVL_ROVOL;
- params->force = false;
- if (params->rwpath || name[0] == '%') {
- params->type = AFSVL_RWVOL;
- params->force = true;
+ if (name[0] == '%') {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
name++;
/* split the cell name out if there is one */
- params->volname = strchr(name, ':');
- if (params->volname) {
+ ctx->volname = strchr(name, ':');
+ if (ctx->volname) {
cellname = name;
- cellnamesz = params->volname - name;
- params->volname++;
+ cellnamesz = ctx->volname - name;
+ ctx->volname++;
} else {
- params->volname = name;
+ ctx->volname = name;
cellname = NULL;
cellnamesz = 0;
}
/* the volume type is further affected by a possible suffix */
- suffix = strrchr(params->volname, '.');
+ suffix = strrchr(ctx->volname, '.');
if (suffix) {
if (strcmp(suffix, ".readonly") == 0) {
- params->type = AFSVL_ROVOL;
- params->force = true;
+ ctx->type = AFSVL_ROVOL;
+ ctx->force = true;
} else if (strcmp(suffix, ".backup") == 0) {
- params->type = AFSVL_BACKVOL;
- params->force = true;
+ ctx->type = AFSVL_BACKVOL;
+ ctx->force = true;
} else if (suffix[1] == 0) {
} else {
suffix = NULL;
}
}
- params->volnamesz = suffix ?
- suffix - params->volname : strlen(params->volname);
+ ctx->volnamesz = suffix ?
+ suffix - ctx->volname : strlen(ctx->volname);
_debug("cell %*.*s [%p]",
- cellnamesz, cellnamesz, cellname ?: "", params->cell);
+ cellnamesz, cellnamesz, cellname ?: "", ctx->cell);
/* lookup the cell record */
- if (cellname || !params->cell) {
- cell = afs_lookup_cell(params->net, cellname, cellnamesz,
+ if (cellname) {
+ cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
NULL, false);
if (IS_ERR(cell)) {
- printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+ pr_err("kAFS: unable to lookup cell '%*.*s'\n",
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = cell;
}
_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
- params->cell->name, params->cell,
- params->volnamesz, params->volnamesz, params->volname,
- suffix ?: "-", params->type, params->force ? " FORCE" : "");
+ ctx->cell->name, ctx->cell,
+ ctx->volnamesz, ctx->volnamesz, ctx->volname,
+ suffix ?: "-", ctx->type, ctx->force ? " FORCE" : "");
+
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+}
+
+/*
+ * Parse a single mount parameter.
+ */
+static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct afs_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, &afs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_source:
+ return afs_parse_source(fc, param);
+
+ case Opt_autocell:
+ ctx->autocell = true;
+ break;
+
+ case Opt_dyn:
+ ctx->dyn_root = true;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Validate the options, get the cell key and look up the volume.
+ */
+static int afs_validate_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_volume *volume;
+ struct key *key;
+
+ if (!ctx->dyn_root) {
+ if (ctx->no_cell) {
+ pr_warn("kAFS: Can only specify source 'none' with -o dyn\n");
+ return -EINVAL;
+ }
+
+ if (!ctx->cell) {
+ pr_warn("kAFS: No cell specified\n");
+ return -EDESTADDRREQ;
+ }
+
+ /* We try to do the mount securely. */
+ key = afs_request_key(ctx->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ ctx->key = key;
+
+ if (ctx->volume) {
+ afs_put_volume(ctx->cell, ctx->volume);
+ ctx->volume = NULL;
+ }
+
+ volume = afs_create_volume(ctx);
+ if (IS_ERR(volume))
+ return PTR_ERR(volume);
+
+ ctx->volume = volume;
+ }
return 0;
}
@@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params,
/*
* check a superblock to see if it's the one we're looking for
*/
-static int afs_test_super(struct super_block *sb, void *data)
+static int afs_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->volume &&
- as->volume->vid == as1->volume->vid &&
+ as->volume->vid == ctx->volume->vid &&
!as->dyn_root);
}
-static int afs_dynroot_test_super(struct super_block *sb, void *data)
+static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->dyn_root);
}
-static int afs_set_super(struct super_block *sb, void *data)
+static int afs_set_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as = data;
-
- sb->s_fs_info = as;
return set_anon_super(sb, NULL);
}
/*
* fill in the superblock
*/
-static int afs_fill_super(struct super_block *sb,
- struct afs_mount_params *params)
+static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
{
struct afs_super_info *as = AFS_FS_S(sb);
struct afs_fid fid;
@@ -399,7 +414,7 @@ static int afs_fill_super(struct super_block *sb,
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
@@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb,
fid.vnode = 1;
fid.vnode_hi = 0;
fid.unique = 1;
- inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
+ inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL);
}
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (params->autocell || params->dyn_root)
+ if (ctx->autocell || as->dyn_root)
set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
ret = -ENOMEM;
@@ -443,17 +458,20 @@ error:
return ret;
}
-static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params)
+static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as;
as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
if (as) {
- as->net_ns = get_net(params->net_ns);
- if (params->dyn_root)
+ as->net_ns = get_net(fc->net_ns);
+ if (ctx->dyn_root) {
as->dyn_root = true;
- else
- as->cell = afs_get_cell(params->cell);
+ } else {
+ as->cell = afs_get_cell(ctx->cell);
+ as->volume = __afs_get_volume(ctx->volume);
+ }
}
return as;
}
@@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb)
if (as->dyn_root)
afs_dynroot_depopulate(sb);
-
+
/* Clear the callback interests (which will do ilookup5) before
* deactivating the superblock.
*/
@@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb)
}
/*
- * get an AFS superblock
+ * Get an AFS superblock and root directory.
*/
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *options)
+static int afs_get_tree(struct fs_context *fc)
{
- struct afs_mount_params params;
+ struct afs_fs_context *ctx = fc->fs_private;
struct super_block *sb;
- struct afs_volume *candidate;
- struct key *key;
struct afs_super_info *as;
int ret;
- _enter(",,%s,%p", dev_name, options);
-
- memset(&params, 0, sizeof(params));
-
- ret = -EINVAL;
- if (current->nsproxy->net_ns != &init_net)
+ ret = afs_validate_fc(fc);
+ if (ret)
goto error;
- params.net_ns = current->nsproxy->net_ns;
- params.net = afs_net(params.net_ns);
-
- /* parse the options and device name */
- if (options) {
- ret = afs_parse_options(&params, options, &dev_name);
- if (ret < 0)
- goto error;
- }
-
- if (!params.dyn_root) {
- ret = afs_parse_device_name(&params, dev_name);
- if (ret < 0)
- goto error;
- /* try and do the mount securely */
- key = afs_request_key(params.cell);
- if (IS_ERR(key)) {
- _leave(" = %ld [key]", PTR_ERR(key));
- ret = PTR_ERR(key);
- goto error;
- }
- params.key = key;
- }
+ _enter("");
/* allocate a superblock info record */
ret = -ENOMEM;
- as = afs_alloc_sbi(&params);
+ as = afs_alloc_sbi(fc);
if (!as)
- goto error_key;
-
- if (!params.dyn_root) {
- /* Assume we're going to need a volume record; at the very
- * least we can use it to update the volume record if we have
- * one already. This checks that the volume exists within the
- * cell.
- */
- candidate = afs_create_volume(&params);
- if (IS_ERR(candidate)) {
- ret = PTR_ERR(candidate);
- goto error_as;
- }
-
- as->volume = candidate;
- }
+ goto error;
+ fc->s_fs_info = as;
/* allocate a deviceless superblock */
- sb = sget(fs_type,
- as->dyn_root ? afs_dynroot_test_super : afs_test_super,
- afs_set_super, flags, as);
+ sb = sget_fc(fc,
+ as->dyn_root ? afs_dynroot_test_super : afs_test_super,
+ afs_set_super);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
- goto error_as;
+ goto error;
}
if (!sb->s_root) {
/* initial superblock/root creation */
_debug("create");
- ret = afs_fill_super(sb, &params);
+ ret = afs_fill_super(sb, ctx);
if (ret < 0)
goto error_sb;
- as = NULL;
sb->s_flags |= SB_ACTIVE;
} else {
_debug("reuse");
ASSERTCMP(sb->s_flags, &, SB_ACTIVE);
- afs_destroy_sbi(as);
- as = NULL;
}
- afs_put_cell(params.net, params.cell);
- key_put(params.key);
+ fc->root = dget(sb->s_root);
_leave(" = 0 [%p]", sb);
- return dget(sb->s_root);
+ return 0;
error_sb:
deactivate_locked_super(sb);
- goto error_key;
-error_as:
- afs_destroy_sbi(as);
-error_key:
- key_put(params.key);
error:
- afs_put_cell(params.net, params.cell);
_leave(" = %d", ret);
- return ERR_PTR(ret);
+ return ret;
+}
+
+static void afs_free_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+
+ afs_destroy_sbi(fc->s_fs_info);
+ afs_put_volume(ctx->cell, ctx->volume);
+ afs_put_cell(ctx->net, ctx->cell);
+ key_put(ctx->key);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations afs_context_ops = {
+ .free = afs_free_fc,
+ .parse_param = afs_parse_param,
+ .get_tree = afs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int afs_init_fs_context(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx;
+ struct afs_cell *cell;
+
+ ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->type = AFSVL_ROVOL;
+ ctx->net = afs_net(fc->net_ns);
+
+ /* Default to the workstation cell. */
+ rcu_read_lock();
+ cell = afs_lookup_cell_rcu(ctx->net, NULL, 0);
+ rcu_read_unlock();
+ if (IS_ERR(cell))
+ cell = NULL;
+ ctx->cell = cell;
+
+ fc->fs_private = ctx;
+ fc->ops = &afs_context_ops;
+ return 0;
}
/*
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 00975ed3640f..f6eba2def0a1 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" };
/*
* Allocate a volume record and load it up from a vldb record.
*/
-static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params,
+static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
struct afs_vldb_entry *vldb,
unsigned long type_mask)
{
@@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell,
* - Rule 3: If parent volume is R/W, then only mount R/W volume unless
* explicitly told otherwise
*/
-struct afs_volume *afs_create_volume(struct afs_mount_params *params)
+struct afs_volume *afs_create_volume(struct afs_fs_context *params)
{
struct afs_vldb_entry *vldb;
struct afs_volume *volume;
diff --git a/fs/aio.c b/fs/aio.c
index aaaaf4d12c73..38b741aef0bf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -167,9 +167,13 @@ struct kioctx {
unsigned id;
};
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
struct fsync_iocb {
- struct work_struct work;
struct file *file;
+ struct work_struct work;
bool datasync;
};
@@ -183,8 +187,15 @@ struct poll_iocb {
struct work_struct work;
};
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
struct aio_kiocb {
union {
+ struct file *ki_filp;
struct kiocb rw;
struct fsync_iocb fsync;
struct poll_iocb poll;
@@ -1060,6 +1071,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
{
if (refcount_read(&iocb->ki_refcnt) == 0 ||
refcount_dec_and_test(&iocb->ki_refcnt)) {
+ if (iocb->ki_filp)
+ fput(iocb->ki_filp);
percpu_ref_put(&iocb->ki_ctx->reqs);
kmem_cache_free(kiocb_cachep, iocb);
}
@@ -1424,7 +1437,6 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
file_end_write(kiocb->ki_filp);
}
- fput(kiocb->ki_filp);
aio_complete(iocb, res, res2);
}
@@ -1432,9 +1444,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
{
int ret;
- req->ki_filp = fget(iocb->aio_fildes);
- if (unlikely(!req->ki_filp))
- return -EBADF;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
@@ -1451,7 +1460,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = ioprio_check_cap(iocb->aio_reqprio);
if (ret) {
pr_debug("aio ioprio check cap error: %d\n", ret);
- goto out_fput;
+ return ret;
}
req->ki_ioprio = iocb->aio_reqprio;
@@ -1460,14 +1469,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
if (unlikely(ret))
- goto out_fput;
+ return ret;
req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
return 0;
-
-out_fput:
- fput(req->ki_filp);
- return ret;
}
static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
@@ -1521,24 +1526,19 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
if (ret)
return ret;
file = req->ki_filp;
-
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_fput;
+ return -EBADF;
ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
aio_rw_done(req, call_read_iter(file, req, &iter));
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1555,16 +1555,14 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
return ret;
file = req->ki_filp;
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->write_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
/*
@@ -1582,9 +1580,6 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
aio_rw_done(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1594,7 +1589,6 @@ static void aio_fsync_work(struct work_struct *work)
int ret;
ret = vfs_fsync(req->file, req->datasync);
- fput(req->file);
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
}
@@ -1605,13 +1599,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
iocb->aio_rw_flags))
return -EINVAL;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
- if (unlikely(!req->file->f_op->fsync)) {
- fput(req->file);
+ if (unlikely(!req->file->f_op->fsync))
return -EINVAL;
- }
req->datasync = datasync;
INIT_WORK(&req->work, aio_fsync_work);
@@ -1621,10 +1610,7 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
{
- struct file *file = iocb->poll.file;
-
aio_complete(iocb, mangle_poll(mask), 0);
- fput(file);
}
static void aio_poll_complete_work(struct work_struct *work)
@@ -1680,6 +1666,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
__poll_t mask = key_to_poll(key);
+ unsigned long flags;
req->woken = true;
@@ -1688,10 +1675,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (!(mask & req->events))
return 0;
- /* try to complete the iocb inline if we can: */
- if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ /*
+ * Try to complete the iocb inline if we can. Use
+ * irqsave/irqrestore because not all filesystems (e.g. fuse)
+ * call this function with IRQs disabled and because IRQs
+ * have to be disabled before ctx_lock is obtained.
+ */
+ if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
list_del(&iocb->ki_list);
- spin_unlock(&iocb->ki_ctx->ctx_lock);
+ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
list_del_init(&req->wait.entry);
aio_poll_complete(iocb, mask);
@@ -1743,9 +1735,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
INIT_WORK(&req->work, aio_poll_complete_work);
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
req->head = NULL;
req->woken = false;
@@ -1788,10 +1777,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
spin_unlock_irq(&ctx->ctx_lock);
out:
- if (unlikely(apt.error)) {
- fput(req->file);
+ if (unlikely(apt.error))
return apt.error;
- }
if (mask)
aio_poll_complete(aiocb, mask);
@@ -1829,6 +1816,11 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
if (unlikely(!req))
goto out_put_reqs_available;
+ req->ki_filp = fget(iocb->aio_fildes);
+ ret = -EBADF;
+ if (unlikely(!req->ki_filp))
+ goto out_put_req;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -2199,11 +2191,11 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
- compat_long_t, min_nr,
- compat_long_t, nr,
- struct io_event __user *, events,
- struct old_timespec32 __user *, timeout)
+SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
+ __s32, min_nr,
+ __s32, nr,
+ struct io_event __user *, events,
+ struct old_timespec32 __user *, timeout)
{
struct timespec64 t;
int ret;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 3e59f0ed777b..70c132acdab1 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_CATATONIC 0x0001
#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE 0x0004
struct autofs_sb_info {
u32 magic;
@@ -215,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
return -EINVAL;
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
+ /* We don't expect -EAGAIN */
+ pipe->f_flags &= ~O_NONBLOCK;
return 0;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 078992eee299..80597b88718b 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -82,18 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",maxproto=%d", sbi->max_proto);
if (autofs_type_offset(sbi->type))
- seq_printf(m, ",offset");
+ seq_puts(m, ",offset");
else if (autofs_type_direct(sbi->type))
- seq_printf(m, ",direct");
+ seq_puts(m, ",direct");
else
- seq_printf(m, ",indirect");
+ seq_puts(m, ",indirect");
if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
- seq_printf(m, ",strictexpire");
+ seq_puts(m, ",strictexpire");
+ if (sbi->flags & AUTOFS_SBI_IGNORE)
+ seq_puts(m, ",ignore");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
else
- seq_printf(m, ",pipe_ino=-1");
+ seq_puts(m, ",pipe_ino=-1");
#endif
return 0;
}
@@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = {
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
+ Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+ Opt_ignore};
static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
@@ -124,6 +127,7 @@ static const match_table_t tokens = {
{Opt_direct, "direct"},
{Opt_offset, "offset"},
{Opt_strictexpire, "strictexpire"},
+ {Opt_ignore, "ignore"},
{Opt_err, NULL}
};
@@ -206,6 +210,9 @@ static int parse_options(char *options,
case Opt_strictexpire:
sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
+ break;
default:
return 1;
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ca9725f18e00..1fefd87eb4b4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -29,97 +29,14 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/a.out-core.h>
static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
-#ifdef CONFIG_COREDUMP
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-static int aout_core_dump(struct coredump_params *cprm)
-{
- mm_segment_t fs;
- int has_dumped = 0;
- void __user *dump_start;
- int dump_size;
- struct user dump;
-#ifdef __alpha__
-# define START_DATA(u) ((void __user *)u.start_data)
-#else
-# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
- u.start_code))
-#endif
-# define START_STACK(u) ((void __user *)u.start_stack)
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- has_dumped = 1;
- strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
- dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->siginfo->si_signo;
- aout_dump_thread(cprm->regs, &dump);
-
-/* If the size of the dump file exceeds the rlimit, then see what would happen
- if we wrote the stack, but not the data area. */
- if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
- dump.u_dsize = 0;
-
-/* Make sure we have enough room to write the stack and data areas. */
- if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
- dump.u_ssize = 0;
-
-/* make sure we actually have a data and stack area to dump */
- set_fs(USER_DS);
- if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
- dump.u_dsize = 0;
- if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
- dump.u_ssize = 0;
-
- set_fs(KERNEL_DS);
-/* struct user */
- if (!dump_emit(cprm, &dump, sizeof(dump)))
- goto end_coredump;
-/* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
- goto end_coredump;
-/* now we start writing out the user space info */
- set_fs(USER_DS);
-/* Dump the data area */
- if (dump.u_dsize != 0) {
- dump_start = START_DATA(dump);
- dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-/* Now prepare to dump the stack area */
- if (dump.u_ssize != 0) {
- dump_start = START_STACK(dump);
- dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-end_coredump:
- set_fs(fs);
- return has_dumped;
-}
-#else
-#define aout_core_dump NULL
-#endif
-
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
};
#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 54207327f98f..7d09d125f148 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -57,8 +57,6 @@
#endif
static int load_elf_binary(struct linux_binprm *bprm);
-static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
- int, int, unsigned long);
#ifdef CONFIG_USELIB
static int load_elf_library(struct file *);
@@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
#ifndef elf_map
static unsigned long elf_map(struct file *filep, unsigned long addr,
- struct elf_phdr *eppnt, int prot, int type,
+ const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
{
unsigned long map_addr;
@@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
#endif /* !elf_map */
-static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -414,12 +412,13 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
* header pointed to by elf_ex, into a newly allocated array. The caller is
* responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
*/
-static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, size, err = -1;
+ int retval, err = -1;
loff_t pos = elf_ex->e_phoff;
+ unsigned int size;
/*
* If the size of this structure has changed, then punt, since
@@ -429,13 +428,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
goto out;
/* Sanity check the number of program headers... */
- if (elf_ex->e_phnum < 1 ||
- elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
-
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -2033,7 +2028,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
- struct list_head *t;
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,10 +2044,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_add(&ets->list, &info->thread_list);
}
- list_for_each(t, &info->thread_list) {
+ list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- ets = list_entry(t, struct elf_thread_status, list);
sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
@@ -2117,20 +2110,17 @@ static size_t get_note_info_size(struct elf_note_info *info)
static int write_note_info(struct elf_note_info *info,
struct coredump_params *cprm)
{
+ struct elf_thread_status *ets;
int i;
- struct list_head *t;
for (i = 0; i < info->numnote; i++)
if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
- list_for_each(t, &info->thread_list) {
- struct elf_thread_status *tmp =
- list_entry(t, struct elf_thread_status, list);
-
- for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm))
+ list_for_each_entry(ets, &info->thread_list, list) {
+ for (i = 0; i < ets->num_notes; i++)
+ if (!writenote(&ets->notes[i], cprm))
return 0;
}
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f46ad26..e996174cbfc0 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,13 +14,30 @@
#include <linux/err.h>
#include <linux/fs.h>
+static inline bool spacetab(char c) { return c == ' ' || c == '\t'; }
+static inline char *next_non_spacetab(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (!spacetab(*first))
+ return first;
+ return NULL;
+}
+static inline char *next_terminator(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (spacetab(*first) || !*first)
+ return first;
+ return NULL;
+}
+
static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
- char *cp;
+ char *cp, *buf_end;
struct file *file;
int retval;
+ /* Not ours to exec if we don't start with "#!". */
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
@@ -33,18 +50,40 @@ static int load_script(struct linux_binprm *bprm)
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
return -ENOENT;
- /*
- * This section does the #! interpretation.
- * Sorta complicated, but hopefully it will work. -TYT
- */
-
+ /* Release since we are not mapping a binary into memory. */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
- bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
- if ((cp = strchr(bprm->buf, '\n')) == NULL)
- cp = bprm->buf+BINPRM_BUF_SIZE-1;
+ /*
+ * This section handles parsing the #! line into separate
+ * interpreter path and argument strings. We must be careful
+ * because bprm->buf is not yet guaranteed to be NUL-terminated
+ * (though the buffer will have trailing NUL padding when the
+ * file size was smaller than the buffer size).
+ *
+ * We do not want to exec a truncated interpreter path, so either
+ * we find a newline (which indicates nothing is truncated), or
+ * we find a space/tab/NUL after the interpreter path (which
+ * itself may be preceded by spaces/tabs). Truncating the
+ * arguments is fine: the interpreter can re-read the script to
+ * parse them on its own.
+ */
+ buf_end = bprm->buf + sizeof(bprm->buf) - 1;
+ cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n');
+ if (!cp) {
+ cp = next_non_spacetab(bprm->buf + 2, buf_end);
+ if (!cp)
+ return -ENOEXEC; /* Entire buf is spaces/tabs */
+ /*
+ * If there is no later space/tab/NUL we must assume the
+ * interpreter path is truncated.
+ */
+ if (!next_terminator(cp, buf_end))
+ return -ENOEXEC;
+ cp = buf_end;
+ }
+ /* NUL-terminate the buffer and any trailing spaces/tabs. */
*cp = '\0';
while (cp > bprm->buf) {
cp--;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 58a4c1217fa8..e9faa52bb489 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
ssize_t ret;
blk_qc_t qc;
int i;
+ struct bvec_iter_all iter_all;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -247,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
task_io_account_write(ret);
}
if (iocb->ki_flags & IOCB_HIPRI)
- bio.bi_opf |= REQ_HIPRI;
+ bio_set_polled(&bio, iocb);
qc = submit_bio(&bio);
for (;;) {
@@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
}
__set_current_state(TASK_RUNNING);
- bio_for_each_segment_all(bvec, &bio, i) {
+ bio_for_each_segment_all(bvec, &bio, i, iter_all) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
@@ -293,6 +294,14 @@ struct blkdev_dio {
static struct bio_set blkdev_dio_pool;
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+ struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
@@ -329,8 +338,9 @@ static void blkdev_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
@@ -406,10 +416,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
- if (iocb->ki_flags & IOCB_HIPRI)
- bio->bi_opf |= REQ_HIPRI;
+ bool polled = false;
+
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio_set_polled(bio, iocb);
+ polled = true;
+ }
qc = submit_bio(bio);
+
+ if (polled)
+ WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
@@ -2076,6 +2093,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
+ .iopoll = blkdev_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 3b66c957ea6f..5810463dc6d2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -9,6 +9,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/posix_acl.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
@@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
}
if (acl) {
+ unsigned int nofs_flag;
+
size = posix_acl_xattr_size(acl->a_count);
+ /*
+ * We're holding a transaction handle, so use a NOFS memory
+ * allocation context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
value = kmalloc(size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!value) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d522494698fa..122cb97c7909 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
}
if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
+ ret->current_active, name);
else
- ret->normal_wq = alloc_workqueue("%s-%s", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
+ ret->current_active, name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78556447e1d5..11459fe84a29 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -712,7 +712,7 @@ out:
* read tree blocks and add keys where required.
*/
static int add_missing_keys(struct btrfs_fs_info *fs_info,
- struct preftrees *preftrees)
+ struct preftrees *preftrees, bool lock)
{
struct prelim_ref *ref;
struct extent_buffer *eb;
@@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- btrfs_tree_read_lock(eb);
+ if (lock)
+ btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
else
btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
- btrfs_tree_read_unlock(eb);
+ if (lock)
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
cond_resched();
@@ -1227,7 +1229,7 @@ again:
btrfs_release_path(path);
- ret = add_missing_keys(fs_info, &preftrees);
+ ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
if (ret)
goto out;
@@ -1288,11 +1290,15 @@ again:
ret = -EIO;
goto out;
}
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+ if (!path->skip_locking) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_read(eb);
+ }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
@@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
if (!path->skip_locking)
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 548057630b69..4f2a8ae0aa42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -162,13 +162,14 @@ csum_failed:
} else {
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, i)
+ bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);
@@ -730,6 +731,28 @@ struct heuristic_ws {
struct list_head list;
};
+static struct workspace_manager heuristic_wsm;
+
+static void heuristic_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
+}
+
+static void heuristic_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&heuristic_wsm);
+}
+
+static struct list_head *heuristic_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&heuristic_wsm, level);
+}
+
+static void heuristic_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&heuristic_wsm, ws);
+}
+
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -742,7 +765,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(unsigned int level)
{
struct heuristic_ws *ws;
@@ -769,65 +792,59 @@ fail:
return ERR_PTR(-ENOMEM);
}
-struct workspaces_list {
- struct list_head idle_ws;
- spinlock_t ws_lock;
- /* Number of free workspaces */
- int free_ws;
- /* Total number of allocated workspaces */
- atomic_t total_ws;
- /* Waiters for a free workspace */
- wait_queue_head_t ws_wait;
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+ .init_workspace_manager = heuristic_init_workspace_manager,
+ .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
+ .get_workspace = heuristic_get_workspace,
+ .put_workspace = heuristic_put_workspace,
+ .alloc_workspace = alloc_heuristic_ws,
+ .free_workspace = free_heuristic_ws,
};
-static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
-
-static struct workspaces_list btrfs_heuristic_ws;
-
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+ /* The heuristic is represented as compression type 0 */
+ &btrfs_heuristic_compress,
&btrfs_zlib_compress,
&btrfs_lzo_compress,
&btrfs_zstd_compress,
};
-void __init btrfs_init_compress(void)
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops)
{
struct list_head *workspace;
- int i;
- INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
- spin_lock_init(&btrfs_heuristic_ws.ws_lock);
- atomic_set(&btrfs_heuristic_ws.total_ws, 0);
- init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+ wsm->ops = ops;
- workspace = alloc_heuristic_ws();
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ init_waitqueue_head(&wsm->ws_wait);
+
+ /*
+ * Preallocate one workspace for each compression type so we can
+ * guarantee forward progress in the worst case
+ */
+ workspace = wsm->ops->alloc_workspace(0);
if (IS_ERR(workspace)) {
pr_warn(
- "BTRFS: cannot preallocate heuristic workspace, will try later\n");
+ "BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
- atomic_set(&btrfs_heuristic_ws.total_ws, 1);
- btrfs_heuristic_ws.free_ws = 1;
- list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+ list_add(workspace, &wsm->idle_ws);
}
+}
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
- spin_lock_init(&btrfs_comp_ws[i].ws_lock);
- atomic_set(&btrfs_comp_ws[i].total_ws, 0);
- init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+{
+ struct list_head *ws;
- /*
- * Preallocate one workspace for each compression type so
- * we can guarantee forward progress in the worst case
- */
- workspace = btrfs_compress_op[i]->alloc_workspace();
- if (IS_ERR(workspace)) {
- pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
- } else {
- atomic_set(&btrfs_comp_ws[i].total_ws, 1);
- btrfs_comp_ws[i].free_ws = 1;
- list_add(workspace, &btrfs_comp_ws[i].idle_ws);
- }
+ while (!list_empty(&wsman->idle_ws)) {
+ ws = wsman->idle_ws.next;
+ list_del(ws);
+ wsman->ops->free_workspace(ws);
+ atomic_dec(&wsman->total_ws);
}
}
@@ -837,11 +854,11 @@ void __init btrfs_init_compress(void)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-static struct list_head *__find_workspace(int type, bool heuristic)
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level)
{
struct list_head *workspace;
int cpus = num_online_cpus();
- int idx = type - 1;
unsigned nofs_flag;
struct list_head *idle_ws;
spinlock_t *ws_lock;
@@ -849,19 +866,11 @@ static struct list_head *__find_workspace(int type, bool heuristic)
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
again:
spin_lock(ws_lock);
@@ -892,10 +901,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- if (heuristic)
- workspace = alloc_heuristic_ws();
- else
- workspace = btrfs_compress_op[idx]->alloc_workspace();
+ workspace = wsm->ops->alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -926,85 +932,47 @@ again:
return workspace;
}
-static struct list_head *find_workspace(int type)
+static struct list_head *get_workspace(int type, int level)
{
- return __find_workspace(type, false);
+ return btrfs_compress_op[type]->get_workspace(level);
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-static void __free_workspace(int type, struct list_head *workspace,
- bool heuristic)
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
{
- int idx = type - 1;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
- list_add(workspace, idle_ws);
+ list_add(ws, idle_ws);
(*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
- if (heuristic)
- free_heuristic_ws(workspace);
- else
- btrfs_compress_op[idx]->free_workspace(workspace);
+ wsm->ops->free_workspace(ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
}
-static void free_workspace(int type, struct list_head *ws)
+static void put_workspace(int type, struct list_head *ws)
{
- return __free_workspace(type, ws, false);
-}
-
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
- struct list_head *workspace;
- int i;
-
- while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
- workspace = btrfs_heuristic_ws.idle_ws.next;
- list_del(workspace);
- free_heuristic_ws(workspace);
- atomic_dec(&btrfs_heuristic_ws.total_ws);
- }
-
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
- workspace = btrfs_comp_ws[i].idle_ws.next;
- list_del(workspace);
- btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&btrfs_comp_ws[i].total_ws);
- }
- }
+ return btrfs_compress_op[type]->put_workspace(ws);
}
/*
@@ -1036,18 +1004,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_in,
unsigned long *total_out)
{
+ int type = btrfs_compress_type(type_level);
+ int level = btrfs_compress_level(type_level);
struct list_head *workspace;
int ret;
- int type = type_level & 0xF;
-
- workspace = find_workspace(type);
- btrfs_compress_op[type - 1]->set_level(workspace, type_level);
- ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+ workspace = get_workspace(type, level);
+ ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
start, pages,
out_pages,
total_in, total_out);
- free_workspace(type, workspace);
+ put_workspace(type, workspace);
return ret;
}
@@ -1071,9 +1038,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int ret;
int type = cb->compress_type;
- workspace = find_workspace(type);
- ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
- free_workspace(type, workspace);
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ put_workspace(type, workspace);
return ret;
}
@@ -1089,19 +1056,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
struct list_head *workspace;
int ret;
- workspace = find_workspace(type);
-
- ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress(workspace, data_in,
dest_page, start_byte,
srclen, destlen);
+ put_workspace(type, workspace);
- free_workspace(type, workspace);
return ret;
}
+void __init btrfs_init_compress(void)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->init_workspace_manager();
+}
+
void __cold btrfs_exit_compress(void)
{
- free_workspaces();
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->cleanup_workspace_manager();
}
/*
@@ -1512,7 +1489,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = __find_workspace(0, true);
+ struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1581,18 +1558,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
}
out:
- __free_workspace(0, ws_list, true);
+ put_workspace(0, ws_list);
return ret;
}
-unsigned int btrfs_compress_str2level(const char *str)
+/*
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to
+ * level, unrecognized string will set the default level
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
- if (strncmp(str, "zlib", 4) != 0)
+ unsigned int level = 0;
+ int ret;
+
+ if (!type)
return 0;
- /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
- if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
- return str[5] - '0';
+ if (str[0] == ':') {
+ ret = kstrtouint(str + 1, 10, &level);
+ if (ret)
+ level = 0;
+ }
+
+ level = btrfs_compress_op[type]->set_level(level);
- return BTRFS_ZLIB_DEFAULT_LEVEL;
+ return level;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ddda9b80bf20..9976fe0f7526 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -64,6 +64,16 @@ struct compressed_bio {
u32 sums;
};
+static inline unsigned int btrfs_compress_type(unsigned int type_level)
+{
+ return (type_level & 0xF);
+}
+
+static inline unsigned int btrfs_compress_level(unsigned int type_level)
+{
+ return ((type_level & 0xF0) >> 4);
+}
+
void __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
@@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned btrfs_compress_str2level(const char *str);
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
@@ -97,8 +107,35 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_TYPES = 3,
};
+struct workspace_manager {
+ const struct btrfs_compress_op *ops;
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
+};
+
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops);
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level);
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+
struct btrfs_compress_op {
- struct list_head *(*alloc_workspace)(void);
+ void (*init_workspace_manager)(void);
+
+ void (*cleanup_workspace_manager)(void);
+
+ struct list_head *(*get_workspace)(unsigned int level);
+
+ void (*put_workspace)(struct list_head *ws);
+
+ struct list_head *(*alloc_workspace)(unsigned int level);
void (*free_workspace)(struct list_head *workspace);
@@ -119,9 +156,18 @@ struct btrfs_compress_op {
unsigned long start_byte,
size_t srclen, size_t destlen);
- void (*set_level)(struct list_head *ws, unsigned int type);
+ /*
+ * This bounds the level set by the user to be within range of a
+ * particular compression type. It returns the level that will be used
+ * if the level is out of bounds or the default if 0 is passed in.
+ */
+ unsigned int (*set_level)(unsigned int level);
};
+/* The heuristic workspaces are managed via the 0th workspace manager */
+#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
+
+extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5a6c39b44c84..324df36d28bf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -13,6 +13,7 @@
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
+#include "qgroup.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
if (!p->nodes[i] || !p->locks[i])
continue;
- btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_READ_LOCK)
+ /*
+ * If we currently have a spinning reader or writer lock this
+ * will bump the count of blocking holders and drop the
+ * spinlock.
+ */
+ if (p->locks[i] == BTRFS_READ_LOCK) {
+ btrfs_set_lock_blocking_read(p->nodes[i]);
p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- else if (p->locks[i] == BTRFS_WRITE_LOCK)
+ } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+ btrfs_set_lock_blocking_write(p->nodes[i]);
p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
}
}
@@ -1288,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
return eb;
btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
@@ -1378,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
- btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb_root);
eb = btrfs_clone_extent_buffer(eb_root);
btrfs_tree_read_unlock_blocking(eb_root);
free_extent_buffer(eb_root);
@@ -1486,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
- btrfs_set_lock_blocking(parent);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(parent);
+ btrfs_set_lock_blocking_write(buf);
+ /*
+ * Before CoWing this block for later modification, check if it's
+ * the subtree root and do the delayed subtree trace if needed.
+ *
+ * Also We don't care about the error, as it's handled internally.
+ */
+ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
@@ -1582,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking(parent);
+ btrfs_set_lock_blocking_write(parent);
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
@@ -1641,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking(cur);
+ btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@@ -1856,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking(child);
+ btrfs_set_lock_blocking_write(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
if (ret) {
btrfs_tree_unlock(child);
@@ -1894,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (left) {
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left);
if (wret) {
@@ -1909,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (right) {
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right);
if (wret) {
@@ -2072,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2127,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 right_nr;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2529,26 +2544,6 @@ done:
return ret;
}
-static void key_search_validate(struct extent_buffer *b,
- const struct btrfs_key *key,
- int level)
-{
-#ifdef CONFIG_BTRFS_ASSERT
- struct btrfs_disk_key disk_key;
-
- btrfs_cpu_key_to_disk(&disk_key, key);
-
- if (level == 0)
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_leaf, items[0].key),
- sizeof(disk_key)));
- else
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_node, ptrs[0].key),
- sizeof(disk_key)));
-#endif
-}
-
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
int level, int *prev_cmp, int *slot)
{
@@ -2557,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
return *prev_cmp;
}
- key_search_validate(b, key, level);
*slot = 0;
return 0;
@@ -3005,6 +2999,8 @@ again:
*/
prev_cmp = -1;
ret = key_search(b, key, level, &prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
if (level != 0) {
int dec = 0;
@@ -3771,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(fs_info, right);
if (free_space < data_size)
@@ -4005,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(fs_info, left);
if (free_space < data_size) {
@@ -5156,6 +5152,10 @@ again:
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
sret = btrfs_bin_search(cur, min_key, level, &slot);
+ if (sret < 0) {
+ ret = sret;
+ goto out;
+ }
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7a2a2621f0d9..b3642367a595 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -934,7 +934,8 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct mutex cleaner_delayed_iput_mutex;
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -1074,10 +1075,13 @@ struct btrfs_fs_info {
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
- int scrub_workers_refcnt;
+ /*
+ * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
+ * running.
+ */
+ refcount_t scrub_workers_refcnt;
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_nocow_workers;
struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1199,6 +1203,26 @@ enum {
BTRFS_ROOT_MULTI_LOG_TASKS,
BTRFS_ROOT_DIRTY,
BTRFS_ROOT_DELETING,
+
+ /*
+ * Reloc tree is orphan, only kept here for qgroup delayed subtree scan
+ *
+ * Set for the subvolume tree owning the reloc tree.
+ */
+ BTRFS_ROOT_DEAD_RELOC_TREE,
+ /* Mark dead root stored on device whose cleanup needs to be resumed */
+ BTRFS_ROOT_DEAD_TREE,
+};
+
+/*
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+ spinlock_t lock;
+ /* RM_EMPTY_ROOT() of above blocks[] */
+ bool swapped;
+ struct rb_root blocks[BTRFS_MAX_LEVEL];
};
/*
@@ -1312,6 +1336,14 @@ struct btrfs_root {
u64 nr_ordered_extents;
/*
+ * Not empty if this subvolume root has gone through tree block swap
+ * (relocation)
+ *
+ * Will be used by reloc_control::dirty_subvol_roots.
+ */
+ struct list_head reloc_dirty_list;
+
+ /*
* Number of currently running SEND ioctls to prevent
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
@@ -1328,6 +1360,9 @@ struct btrfs_root {
/* Number of active swapfiles */
atomic_t nr_swapfiles;
+ /* Record pairs of swapped blocks for qgroup */
+ struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -2775,7 +2810,8 @@ enum btrfs_flush_state {
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
ALLOC_CHUNK = 7,
- COMMIT_TRANS = 8,
+ ALLOC_CHUNK_FORCE = 8,
+ COMMIT_TRANS = 9,
};
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -3181,8 +3217,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
/* inode.c */
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start,
- u64 len, int create);
+ u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes);
@@ -3254,6 +3289,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
@@ -3261,7 +3297,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@@ -3415,31 +3451,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \
- ##args);\
-} while (0)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \
- ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
@@ -3490,21 +3512,18 @@ do { \
rcu_read_unlock(); \
} while (0)
-#ifdef CONFIG_BTRFS_ASSERT
-
__cold
static inline void assfail(const char *expr, const char *file, int line)
{
- pr_err("assertion failed: %s, file: %s, line: %d\n",
- expr, file, line);
- BUG();
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+ pr_err("assertion failed: %s, file: %s, line: %d\n",
+ expr, file, line);
+ BUG();
+ }
}
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#else
-#define ASSERT(expr) ((void)0)
-#endif
/*
* Use that for functions that are conditionally exported for sanity tests but
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cad36c99a483..7d2a413df90d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
- head_ref->qgroup_reserved = 0;
- head_ref->qgroup_ref_root = 0;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
if (qrecord) {
if (ref_root && reserved) {
- head_ref->qgroup_ref_root = ref_root;
- head_ref->qgroup_reserved = reserved;
+ qrecord->data_rsv = reserved;
+ qrecord->data_rsv_refroot = ref_root;
}
-
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- WARN_ON(qrecord && head_ref->qgroup_ref_root
- && head_ref->qgroup_reserved
- && existing->qgroup_ref_root
- && existing->qgroup_reserved);
update_existing_head_ref(trans, existing, head_ref,
old_ref_mod);
/*
@@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d2af974f68a1..70606da440aa 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -103,17 +103,6 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
- * For qgroup reserved space freeing.
- *
- * ref_root and reserved will be recorded after
- * BTRFS_ADD_DELAYED_EXTENT is called.
- * And will be used to free reserved qgroup space at
- * run_delayed_refs() time.
- */
- u64 qgroup_ref_root;
- u64 qgroup_reserved;
-
- /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8750c835f535..ee193c5222b2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -111,11 +111,11 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
- NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info,
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
+ src_devid, NULL, NULL, true);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ NULL, NULL, true);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
break;
default:
+ up_write(&dev_replace->rwsem);
result = -EINVAL;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a2a2a951705..6fe9197f6ee4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,6 +17,7 @@
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
@@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (need_lock) {
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -832,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
@@ -1120,7 +1122,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
-buf->len,
fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@@ -1175,6 +1177,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
@@ -1218,6 +1221,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@ -1258,10 +1262,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key key;
+ unsigned int nofs_flag;
int ret = 0;
uuid_le uuid = NULL_UUID_LE;
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1707,9 +1718,7 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&fs_info->cleaner_mutex);
@@ -2101,7 +2110,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
- fs_info->scrub_workers_refcnt = 0;
+ refcount_set(&fs_info->scrub_workers_refcnt, 0);
}
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@ -2666,7 +2675,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
- mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2688,6 +2696,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
+ atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2765,6 +2774,7 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
INIT_LIST_HEAD(&fs_info->pinned_chunks);
@@ -2948,7 +2958,7 @@ int open_ctree(struct super_block *sb,
sb->s_bdi->congested_fn = btrfs_congested_fn;
sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -4238,16 +4248,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
- if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->refs);
- spin_unlock(&delayed_refs->lock);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref_head(head);
- spin_lock(&delayed_refs->lock);
+ if (btrfs_delayed_ref_lock(delayed_refs, head))
continue;
- }
+
spin_lock(&head->lock);
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -4263,12 +4266,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->must_insert_reserved)
pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
- delayed_refs->num_heads--;
- if (head->processing == 0)
- delayed_refs->num_heads_ready--;
- atomic_dec(&delayed_refs->num_entries);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
+ btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d81035b7ea7d..1d49694e6ae3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
}
}
- /* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
- head->qgroup_reserved);
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
@@ -3013,8 +3010,7 @@ again:
}
if (run_all) {
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first_cached(&delayed_refs->href_root);
@@ -4280,10 +4276,14 @@ commit_trans:
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
- * more space is released.
+ * more space is released. We don't need to
+ * explicitly run the delayed iputs here because
+ * the commit_transaction would have woken up
+ * the cleaner.
*/
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+ ret = btrfs_wait_on_delayed_iputs(fs_info);
+ if (ret)
+ return ret;
goto again;
} else {
btrfs_end_transaction(trans);
@@ -4396,7 +4396,6 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
@@ -4404,14 +4403,6 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
return 1;
/*
- * We need to take into account the global rsv because for all intents
- * and purposes it's used space. Don't worry about locking the
- * global_rsv, it doesn't change except when the transaction commits.
- */
- if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
- bytes_used += calc_global_rsv_need_space(global_rsv);
-
- /*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
*/
@@ -4741,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
- u64 max_reclaim;
+ u64 async_pages;
u64 items;
long time_left;
unsigned long nr_pages;
@@ -4766,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while (delalloc_bytes && loops < 3) {
- max_reclaim = min(delalloc_bytes, to_reclaim);
- nr_pages = max_reclaim >> PAGE_SHIFT;
+ nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+ /*
+ * Triggers inode writeback for up to nr_pages. This will invoke
+ * ->writepages callback and trigger delalloc filling
+ * (btrfs_run_delalloc_range()).
+ */
btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
/*
- * We need to wait for the async pages to actually start before
- * we do anything.
+ * We need to wait for the compressed pages to start before
+ * we continue.
*/
- max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
- if (!max_reclaim)
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
goto skip_async;
- if (max_reclaim <= nr_pages)
- max_reclaim = 0;
+ /*
+ * Calculate how many compressed pages we want to be written
+ * before we continue. I.e if there are more async pages than we
+ * require wait_event will wait until nr_pages are written.
+ */
+ if (async_pages <= nr_pages)
+ async_pages = 0;
else
- max_reclaim -= nr_pages;
+ async_pages -= nr_pages;
wait_event(fs_info->async_submit_wait,
atomic_read(&fs_info->async_delalloc_pages) <=
- (int)max_reclaim);
+ (int)async_pages);
skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
@@ -4808,6 +4810,7 @@ skip_async:
}
struct reserve_ticket {
+ u64 orig_bytes;
u64 bytes;
int error;
struct list_head list;
@@ -4851,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (!bytes_needed)
return 0;
- /* See if there is enough pinned space to make this reservation */
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * See if there is enough pinned space to make this reservation, or if
+ * we have block groups that are going to be freed, allowing us to
+ * possibly do a chunk allocation the next loop through.
+ */
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+ __percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes_needed,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
goto commit;
/*
@@ -4862,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
* this reservation.
*/
if (space_info != delayed_rsv->space_info)
- return -ENOSPC;
+ goto enospc;
spin_lock(&delayed_rsv->lock);
reclaim_bytes += delayed_rsv->reserved;
@@ -4877,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
- return -ENOSPC;
- }
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+ goto enospc;
commit:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return -ENOSPC;
-
return btrfs_commit_transaction(trans);
+enospc:
+ btrfs_end_transaction(trans);
+ return -ENOSPC;
}
/*
@@ -4939,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
+ case ALLOC_CHUNK_FORCE:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -4946,7 +4957,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
}
ret = do_chunk_alloc(trans,
btrfs_metadata_alloc_profile(fs_info),
- CHUNK_ALLOC_NO_FORCE);
+ (state == ALLOC_CHUNK) ?
+ CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
ret = 0;
@@ -4957,9 +4969,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
* bunch of pinned space, so make sure we run the iputs before
* we do our pinned bytes check below.
*/
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+ btrfs_wait_on_delayed_iputs(fs_info);
ret = may_commit_transaction(fs_info, space_info);
break;
@@ -5030,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
{
struct reserve_ticket *ticket;
@@ -5039,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head)
list_del_init(&ticket->list);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
+ if (ticket->bytes != ticket->orig_bytes)
+ return true;
}
+ return false;
}
/*
@@ -5091,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
+ /*
+ * We don't want to force a chunk allocation until we've tried
+ * pretty hard to reclaim space. Think of the case where we
+ * freed up a bunch of space and so have a lot of pinned space
+ * to reclaim. We would rather use that than possibly create a
+ * underutilized metadata chunk. So if this is our first run
+ * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+ * commit the transaction. If nothing has changed the next go
+ * around then we can force a chunk allocation.
+ */
+ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+ flush_state++;
+
if (flush_state > COMMIT_TRANS) {
commit_cycles++;
if (commit_cycles > 2) {
- wake_all_tickets(&space_info->tickets);
- space_info->flush = 0;
+ if (wake_all_tickets(&space_info->tickets)) {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ commit_cycles--;
+ } else {
+ space_info->flush = 0;
+ }
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
}
@@ -5109,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
+static const enum btrfs_flush_state priority_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ ALLOC_CHUNK,
+};
+
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
u64 to_reclaim;
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
+ int flush_state;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5125,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
}
spin_unlock(&space_info->lock);
+ flush_state = 0;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
+ flush_space(fs_info, space_info, to_reclaim,
+ priority_flush_states[flush_state]);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
@@ -5134,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
return;
}
spin_unlock(&space_info->lock);
-
- /*
- * Priority flushers can't wait on delalloc without
- * deadlocking.
- */
- if (flush_state == FLUSH_DELALLOC ||
- flush_state == FLUSH_DELALLOC_WAIT)
- flush_state = ALLOC_CHUNK;
- } while (flush_state < COMMIT_TRANS);
+ } while (flush_state < ARRAY_SIZE(priority_flush_states));
}
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket, u64 orig_bytes)
+ struct reserve_ticket *ticket)
{
DEFINE_WAIT(wait);
+ u64 reclaim_bytes = 0;
int ret = 0;
spin_lock(&space_info->lock);
@@ -5171,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
ret = ticket->error;
if (!list_empty(&ticket->list))
list_del_init(&ticket->list);
- if (ticket->bytes && ticket->bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket->bytes;
- update_bytes_may_use(space_info, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- }
+ if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+ reclaim_bytes = ticket->orig_bytes - ticket->bytes;
spin_unlock(&space_info->lock);
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
return ret;
}
@@ -5204,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket ticket;
u64 used;
+ u64 reclaim_bytes = 0;
int ret = 0;
ASSERT(orig_bytes);
@@ -5239,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ ticket.orig_bytes = orig_bytes;
ticket.bytes = orig_bytes;
ticket.error = 0;
init_waitqueue_head(&ticket.wait);
@@ -5279,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
if (flush == BTRFS_RESERVE_FLUSH_ALL)
- return wait_reserve_ticket(fs_info, space_info, &ticket,
- orig_bytes);
+ return wait_reserve_ticket(fs_info, space_info, &ticket);
ret = 0;
priority_reclaim_metadata_space(fs_info, space_info, &ticket);
spin_lock(&space_info->lock);
if (ticket.bytes) {
- if (ticket.bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket.bytes;
- update_bytes_may_use(space_info, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags,
- num_bytes, 0);
-
- }
+ if (ticket.bytes < orig_bytes)
+ reclaim_bytes = orig_bytes - ticket.bytes;
list_del_init(&ticket.list);
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
+
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
ASSERT(list_empty(&ticket.list));
return ret;
}
@@ -5775,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 *metadata_bytes, u64 *qgroup_bytes)
+{
+ *metadata_bytes = 0;
+ *qgroup_bytes = 0;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size)
+ *metadata_bytes = block_rsv->size - block_rsv->reserved;
+ if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+ *qgroup_bytes = block_rsv->qgroup_rsv_size -
+ block_rsv->qgroup_rsv_reserved;
+ spin_unlock(&block_rsv->lock);
+}
+
/**
* btrfs_inode_rsv_refill - refill the inode block rsv.
* @inode - the inode we are refilling.
@@ -5790,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
- u64 num_bytes = 0;
- u64 qgroup_num_bytes = 0;
+ u64 num_bytes, last = 0;
+ u64 qgroup_num_bytes;
int ret = -ENOSPC;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size)
- num_bytes = block_rsv->size - block_rsv->reserved;
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
- qgroup_num_bytes = block_rsv->qgroup_rsv_size -
- block_rsv->qgroup_rsv_reserved;
- spin_unlock(&block_rsv->lock);
-
+ calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
if (num_bytes == 0)
return 0;
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
- if (ret)
- return ret;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ do {
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
+ true);
+ if (ret)
+ return ret;
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ last = num_bytes;
+ /*
+ * If we are fragmented we can end up with a lot of
+ * outstanding extents which will make our size be much
+ * larger than our reserved amount.
+ *
+ * If the reservation happens here, it might be very
+ * big though not needed in the end, if the delalloc
+ * flushing happens.
+ *
+ * If this is the case try and do the reserve again.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ calc_refill_bytes(block_rsv, &num_bytes,
+ &qgroup_num_bytes);
+ if (num_bytes == 0)
+ return 0;
+ }
+ } while (ret && last != num_bytes);
+
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5818,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
spin_lock(&block_rsv->lock);
block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
spin_unlock(&block_rsv->lock);
- } else
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ }
return ret;
}
@@ -8066,6 +8125,15 @@ loop:
return ret;
}
+#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
+do { \
+ struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
+ spin_lock(&__rsv->lock); \
+ btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
+ __rsv->size, __rsv->reserved); \
+ spin_unlock(&__rsv->lock); \
+} while (0)
+
static void dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
@@ -8085,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
info->bytes_readonly);
spin_unlock(&info->lock);
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
if (!dump_block_groups)
return;
@@ -8492,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clean_tree_block(fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8690,6 +8764,8 @@ struct walk_control {
u64 refs[BTRFS_MAX_LEVEL];
u64 flags[BTRFS_MAX_LEVEL];
struct btrfs_key update_progress;
+ struct btrfs_key drop_progress;
+ int drop_level;
int stage;
int level;
int shared_level;
@@ -8697,6 +8773,7 @@ struct walk_control {
int keep_locks;
int reada_slot;
int reada_count;
+ int restarted;
};
#define DROP_REFERENCE 1
@@ -8860,6 +8937,33 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
+ * This is used to verify a ref exists for this root to deal with a bug where we
+ * would have a drop_progress key that hadn't been updated properly.
+ */
+static int check_ref_exists(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 parent,
+ int level)
+{
+ struct btrfs_path *path;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = lookup_extent_backref(trans, path, &iref, bytenr,
+ root->fs_info->nodesize, parent,
+ root->root_key.objectid, level, 0);
+ btrfs_free_path(path);
+ if (ret == -ENOENT)
+ return 0;
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
+/*
* helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
@@ -8917,7 +9021,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -8977,7 +9081,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
}
level--;
@@ -9014,6 +9118,23 @@ skip:
}
/*
+ * If we had a drop_progress we need to verify the refs are set
+ * as expected. If we find our ref then we know that from here
+ * on out everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, bytenr, parent,
+ level - 1);
+ if (ret < 0)
+ goto out_unlock;
+ if (ret == 0)
+ goto no_delete;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
* Reloc tree doesn't contribute to qgroup numbers, and we have
* already accounted them at merge time (replace_path),
* thus we could skip expensive subtree trace here.
@@ -9028,13 +9149,23 @@ skip:
ret);
}
}
+
+ /*
+ * We need to update the next key in our walk control so we can
+ * update the drop_progress key accordingly. We don't care if
+ * find_next_key doesn't find a key because that means we're at
+ * the end and are going to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
parent, root->root_key.objectid,
level - 1, 0);
if (ret)
goto out_unlock;
}
-
+no_delete:
*lookup_info = 1;
ret = 1;
@@ -9089,7 +9220,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9131,7 +9262,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
clean_tree_block(fs_info, eb);
@@ -9298,7 +9429,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
memset(&wc->update_progress, 0,
@@ -9328,7 +9459,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9351,6 +9482,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
+ wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
wc->level = level;
wc->shared_level = -1;
wc->stage = DROP_REFERENCE;
@@ -9378,12 +9510,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (wc->stage == DROP_REFERENCE) {
- level = wc->level;
- btrfs_node_key(path->nodes[level],
- &root_item->drop_progress,
- path->slots[level]);
- root_item->drop_level = level;
+ wc->drop_level = wc->level;
+ btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
+ &wc->drop_progress,
+ path->slots[wc->drop_level]);
}
+ btrfs_cpu_key_to_disk(&root_item->drop_progress,
+ &wc->drop_progress);
+ root_item->drop_level = wc->drop_level;
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
@@ -9595,6 +9729,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
+ u64 sinfo_used;
u64 min_allocable_bytes;
int ret = -ENOSPC;
@@ -9621,9 +9756,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo_used = btrfs_space_info_used(sinfo, true);
- if (btrfs_space_info_used(sinfo, true) + num_bytes +
- min_allocable_bytes <= sinfo->total_bytes) {
+ if (sinfo_used + num_bytes + min_allocable_bytes <=
+ sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9632,6 +9768,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
+ if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(cache->fs_info,
+ "unable to make block group %llu ro",
+ cache->key.objectid);
+ btrfs_info(cache->fs_info,
+ "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+ sinfo_used, num_bytes, min_allocable_bytes);
+ dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ }
return ret;
}
@@ -10781,13 +10926,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
spin_lock(&trans->transaction->dirty_bgs_lock);
- if (!list_empty(&block_group->dirty_list)) {
- WARN_ON(1);
- }
- if (!list_empty(&block_group->io_list)) {
- WARN_ON(1);
- }
+ WARN_ON(!list_empty(&block_group->dirty_list));
+ WARN_ON(!list_empty(&block_group->io_list));
spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 52abe4082680..ca8b8e785cf3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -147,7 +147,39 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd);
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ blk_status_t ret = 0;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+ struct bio_vec bv;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+
+ mp_bvec_last_segment(bvec, &bv);
+ start = page_offset(bv.bv_page) + bv.bv_offset;
+
+ bio->bi_private = NULL;
+
+ if (tree->ops)
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+ mirror_num, bio_flags, start);
+ else
+ btrfsic_submit_bio(bio);
+
+ return blk_status_to_errno(ret);
+}
+
+static void flush_write_bio(struct extent_page_data *epd)
+{
+ if (epd->bio) {
+ int ret;
+
+ ret = submit_one_bio(epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
+ epd->bio = NULL;
+ }
+}
int __init extent_io_init(void)
{
@@ -281,8 +313,8 @@ do_insert:
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **prev_ret,
struct rb_node **next_ret,
+ struct rb_node **prev_ret,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
@@ -311,23 +343,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
if (parent_ret)
*parent_ret = prev;
- if (prev_ret) {
+ if (next_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *prev_ret = prev;
+ *next_ret = prev;
prev = orig_prev;
}
- if (next_ret) {
+ if (prev_ret) {
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
while (prev && offset < prev_entry->start) {
prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *next_ret = prev;
+ *prev_ret = prev;
}
return NULL;
}
@@ -338,12 +370,12 @@ tree_search_for_insert(struct extent_io_tree *tree,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
- struct rb_node *prev = NULL;
+ struct rb_node *next= NULL;
struct rb_node *ret;
- ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+ ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
if (!ret)
- return prev;
+ return next;
return ret;
}
@@ -585,7 +617,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (delete)
bits |= ~EXTENT_CTLBITS;
- bits |= EXTENT_FIRST_DELALLOC;
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
@@ -850,7 +881,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
- bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@@ -2350,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
- unsigned failed_bio_pages = bio_pages_all(failed_bio);
+ unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2422,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2493,9 +2524,10 @@ static void end_bio_extent_readpage(struct bio *bio)
int mirror;
int ret;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2692,28 +2724,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
-{
- blk_status_t ret = 0;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct page *page = bvec->bv_page;
- struct extent_io_tree *tree = bio->bi_private;
- u64 start;
-
- start = page_offset(page) + bvec->bv_offset;
-
- bio->bi_private = NULL;
-
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags, start);
- else
- btrfsic_submit_bio(bio);
-
- return blk_status_to_errno(ret);
-}
-
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @tree: tree so we can call our merge_bio hook
@@ -2985,11 +2995,11 @@ static int __do_readpage(struct extent_io_tree *tree,
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->orig_start)
+ *prev_em_start != em->start)
force_bio_submit = true;
if (prev_em_start)
- *prev_em_start = em->orig_start;
+ *prev_em_start = em->start;
free_extent_map(em);
em = NULL;
@@ -3634,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
struct bio_vec *bvec;
struct extent_buffer *eb;
int i, done;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;
@@ -4007,17 +4018,6 @@ retry:
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd)
-{
- if (epd->bio) {
- int ret;
-
- ret = submit_one_bio(epd->bio, 0, 0);
- BUG_ON(ret < 0); /* -ENOMEM */
- epd->bio = NULL;
- }
-}
-
int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
@@ -4259,8 +4259,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
- len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
if (IS_ERR_OR_NULL(em))
return em;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9673be3f3d1f..08749e0b9c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,17 +18,16 @@
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
#define EXTENT_CLEAR_META_RESV (1U << 11)
-#define EXTENT_FIRST_DELALLOC (1U << 12)
-#define EXTENT_NEED_WAIT (1U << 13)
-#define EXTENT_DAMAGED (1U << 14)
-#define EXTENT_NORESERVE (1U << 15)
-#define EXTENT_QGROUP_RESERVED (1U << 16)
-#define EXTENT_CLEAR_DATA_RESV (1U << 17)
-#define EXTENT_DELALLOC_NEW (1U << 18)
+#define EXTENT_NEED_WAIT (1U << 12)
+#define EXTENT_DAMAGED (1U << 13)
+#define EXTENT_NORESERVE (1U << 14)
+#define EXTENT_QGROUP_RESERVED (1U << 15)
+#define EXTENT_CLEAR_DATA_RESV (1U << 16)
+#define EXTENT_DELALLOC_NEW (1U << 17)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
/*
* flags for bio submission. The high bits indicate the compression
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a042a193c120..928f729c55ba 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (!list_empty(&prev->list) || !list_empty(&next->list))
return 0;
+ ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
+ prev->block_start != EXTENT_MAP_DELALLOC);
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start == EXTENT_MAP_DELALLOC &&
- prev->block_start == EXTENT_MAP_DELALLOC) ||
(next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
next->block_start == extent_map_block_end(prev)))) {
return 1;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ef05a0121652..473f039fcd7c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -9,6 +9,7 @@
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
+/* used only during fiemap calls */
#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d38dc8c31533..34fe8a58b0e9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
&cached_state);
while (start < inode->i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
- start, len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c349667c761..82fdda8ff5ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
- u64 isize = i_size_read(inode);
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode,
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, isize, end + 1);
+ actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent)
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline void submit_compressed_extents(struct inode *inode,
- struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_cow *async_cow)
{
+ struct inode *inode = async_cow->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
@@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
+ /*
+ * ->inode could be NULL if async_cow_start has failed to compress,
+ * in which case we don't have anything to submit, yet we need to
+ * always adjust ->async_delalloc_pages as its paired with the init
+ * happening in cow_file_range_async
+ */
if (async_cow->inode)
- submit_compressed_extents(async_cow->inode, async_cow);
+ submit_compressed_extents(async_cow);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
- async_cow->inode = igrab(inode);
+ /*
+ * igrab is called higher up in the call chain, take only the
+ * lightweight reference for the callback lifetime
+ */
+ ihold(inode);
+ async_cow->inode = inode;
async_cow->fs_info = fs_info;
async_cow->locked_page = locked_page;
async_cow->start = start;
@@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
- struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
unsigned int write_flags = wbc_to_write_flags(wbc);
@@ -3247,6 +3256,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
+ atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
@@ -3267,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+ wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
+/**
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
+ * @fs_info - the fs_info for this fs
+ * @return - EINTR if we were killed, 0 if nothing's pending
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set. Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps in user operations like fallocate etc
+ * that might get blocked on the iputs.
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
+ if (ret)
+ return -EINTR;
+ return 0;
+}
+
/*
* This creates an orphan entry for the given inode in case something goes wrong
* in the middle of an unlink.
@@ -5262,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
int failures = 0;
for (;;) {
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+ ret = btrfs_block_rsv_refill(root, rsv,
+ rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5277,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
return ERR_PTR(-ENOSPC);
}
+ /*
+ * Evict can generate a large amount of delayed refs without
+ * having a way to add space back since we exhaust our temporary
+ * block rsv. We aren't allowed to do FLUSH_ALL in this case
+ * because we could deadlock with so many things in the flushing
+ * code, so we have to try and hold some extra space to
+ * compensate for our delayed ref generation. If we can't get
+ * that space then we need see if we can steal our minimum from
+ * the global reserve. We will be ratelimited by the amount of
+ * space we have for the delayed refs rsv, so we'll end up
+ * committing and trying again.
+ */
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans) || !ret)
+ if (IS_ERR(trans) || !ret) {
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = delayed_refs_extra;
+ btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+ delayed_refs_extra, 1);
+ }
return trans;
+ }
/*
* Try to steal from the global reserve if there is space for
@@ -6731,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
- u32 found_type;
+ u8 extent_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
@@ -6786,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (ret < 0) {
err = ret;
goto out;
- }
-
- if (ret != 0) {
+ } else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
@@ -6797,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- /* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
if (found_key.objectid != objectid ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
+ found_key.type != BTRFS_EXTENT_DATA_KEY) {
/*
* If we backup past the first extent we want to move forward
* and see if there is an extent in front of us, otherwise we'll
@@ -6812,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto next;
}
- found_type = btrfs_file_extent_type(leaf, item);
+ extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -6840,9 +6888,9 @@ next:
if (ret < 0) {
err = ret;
goto out;
- }
- if (ret > 0)
+ } else if (ret > 0) {
goto not_found;
+ }
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6853,19 +6901,22 @@ next:
goto not_found;
if (start > found_key.offset)
goto next;
+
+ /* New extent overlaps with existing one */
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
- goto not_found_em;
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
}
btrfs_extent_item_to_extent_map(inode, path, item,
new_inline, em);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
char *map;
size_t size;
@@ -6916,7 +6967,6 @@ not_found:
em->start = start;
em->orig_start = start;
em->len = len;
-not_found_em:
em->block_start = EXTENT_MAP_HOLE;
insert:
btrfs_release_path(path);
@@ -6946,19 +6996,17 @@ out:
}
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ u64 start, u64 len)
{
struct extent_map *em;
struct extent_map *hole_em = NULL;
- u64 range_start = start;
+ u64 delalloc_start = start;
u64 end;
- u64 found;
- u64 found_end;
+ u64 delalloc_len;
+ u64 delalloc_end;
int err = 0;
- em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
return em;
/*
@@ -6983,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = NULL;
/* ok, we didn't find anything, lets look for delalloc */
- found = count_range_bits(&inode->io_tree, &range_start,
+ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
end, len, EXTENT_DELALLOC, 1);
- found_end = range_start + found;
- if (found_end < range_start)
- found_end = (u64)-1;
+ delalloc_end = delalloc_start + delalloc_len;
+ if (delalloc_end < delalloc_start)
+ delalloc_end = (u64)-1;
/*
- * we didn't find anything useful, return
- * the original results from get_extent()
+ * We didn't find anything useful, return the original results from
+ * get_extent()
*/
- if (range_start > end || found_end <= start) {
+ if (delalloc_start > end || delalloc_end <= start) {
em = hole_em;
hole_em = NULL;
goto out;
}
- /* adjust the range_start to make sure it doesn't
- * go backwards from the start they passed in
+ /*
+ * Adjust the delalloc_start to make sure it doesn't go backwards from
+ * the start they passed in
*/
- range_start = max(start, range_start);
- found = found_end - range_start;
+ delalloc_start = max(start, delalloc_start);
+ delalloc_len = delalloc_end - delalloc_start;
- if (found > 0) {
- u64 hole_start = start;
- u64 hole_len = len;
+ if (delalloc_len > 0) {
+ u64 hole_start;
+ u64 hole_len;
+ const u64 hole_end = extent_map_end(hole_em);
em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
}
+ em->bdev = NULL;
+
+ ASSERT(hole_em);
/*
- * when btrfs_get_extent can't find anything it
- * returns one huge hole
+ * When btrfs_get_extent can't find anything it returns one
+ * huge hole
*
- * make sure what it found really fits our range, and
- * adjust to make sure it is based on the start from
- * the caller
+ * Make sure what it found really fits our range, and adjust to
+ * make sure it is based on the start from the caller
*/
- if (hole_em) {
- u64 calc_end = extent_map_end(hole_em);
-
- if (calc_end <= start || (hole_em->start > end)) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = calc_end - hole_start;
- }
+ if (hole_end <= start || hole_em->start > end) {
+ free_extent_map(hole_em);
+ hole_em = NULL;
+ } else {
+ hole_start = max(hole_em->start, start);
+ hole_len = hole_end - hole_start;
}
- em->bdev = NULL;
- if (hole_em && range_start > hole_start) {
- /* our hole starts before our delalloc, so we
- * have to return just the parts of the hole
- * that go until the delalloc starts
+
+ if (hole_em && delalloc_start > hole_start) {
+ /*
+ * Our hole starts before our delalloc, so we have to
+ * return just the parts of the hole that go until the
+ * delalloc starts
*/
- em->len = min(hole_len,
- range_start - hole_start);
+ em->len = min(hole_len, delalloc_start - hole_start);
em->start = hole_start;
em->orig_start = hole_start;
/*
- * don't adjust block start at all,
- * it is fixed at EXTENT_MAP_HOLE
+ * Don't adjust block start at all, it is fixed at
+ * EXTENT_MAP_HOLE
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
- em->start = range_start;
- em->len = found;
- em->orig_start = range_start;
+ /*
+ * Hole is out of passed range or it starts after
+ * delalloc range
+ */
+ em->start = delalloc_start;
+ em->len = delalloc_len;
+ em->orig_start = delalloc_start;
em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = found;
+ em->block_len = delalloc_len;
}
} else {
return hole_em;
@@ -7777,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7788,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
io_tree, done->start, bvec->bv_page,
btrfs_ino(BTRFS_I(inode)), 0);
@@ -7867,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio)
int uptodate;
int ret;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7880,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio)
failure_tree = &BTRFS_I(inode)->io_failure_tree;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
bvec->bv_offset, done->start,
bvec->bv_len);
@@ -9910,7 +9964,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9c8e1734429c..ec2d8919e7fb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1642,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -3178,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
s_uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
+ NULL, true);
if (!dev) {
ret = -ENODEV;
@@ -3206,21 +3207,6 @@ out:
return ret;
}
-static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
-{
- inode_unlock(inode1);
- inode_unlock(inode2);
-}
-
-static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
-{
- if (inode1 < inode2)
- swap(inode1, inode2);
-
- inode_lock_nested(inode1, I_MUTEX_PARENT);
- inode_lock_nested(inode2, I_MUTEX_CHILD);
-}
-
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
@@ -3241,32 +3227,17 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
- u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
- u64 len = olen;
-
- if (loff + len == src->i_size)
- len = ALIGN(src->i_size, bs) - loff;
- /*
- * For same inode case we don't want our length pushed out past i_size
- * as comparing that data range makes no sense.
- *
- * This effectively means we require aligned extents for the single
- * inode case, whereas the other cases allow an unaligned length so long
- * as it ends at i_size.
- */
- if (dst == src && len != olen)
- return -EINVAL;
/*
* Lock destination range to serialize with concurrent readpages() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+ ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
@@ -3278,21 +3249,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
- int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
u64 i, tail_len, chunk_count;
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
- if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
- return -ETXTBSY;
-
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
- if (chunk_count == 0)
- num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
@@ -3908,14 +3868,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* be either compressed or non-compressed.
*/
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
- if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
- return -ETXTBSY;
-
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
* eof block into the middle of a file, which would result in corruption
@@ -3989,7 +3941,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (same_inode)
inode_lock(inode_in);
else
- btrfs_double_inode_lock(inode_in, inode_out);
+ lock_two_nondirectories(inode_in, inode_out);
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
/*
* Now that the inodes are locked, we need to start writeback ourselves
@@ -4039,7 +3998,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (same_inode)
inode_unlock(inode_in);
else
- btrfs_double_inode_unlock(inode_in, inode_out);
+ unlock_two_nondirectories(inode_in, inode_out);
return ret;
}
@@ -4069,7 +4028,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (same_inode)
inode_unlock(src_inode);
else
- btrfs_double_inode_unlock(src_inode, dst_inode);
+ unlock_two_nondirectories(src_inode, dst_inode);
return ret < 0 ? ret : len;
}
@@ -4381,7 +4340,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -4414,7 +4373,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4438,7 +4397,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
ret = btrfs_get_dev_stats(fs_info, sa);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4484,7 +4443,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
break;
}
- if (copy_to_user(arg, p, sizeof(*p)))
+ if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
ret = -EFAULT;
out:
kfree(p);
@@ -4790,7 +4749,7 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if (arg) {
+ if ((ret == 0 || ret == -ECANCELED) && arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1da768e5ef75..82b84e4daad1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,43 +14,58 @@
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
-/*
- * if we currently have a spinning reader or writer lock
- * (indicated by the rw flag) this will bump the count
- * of blocking holders and drop the spinlock.
- */
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
/*
- * no lock is required. The lock owner may change if
- * we have a read lock, but it won't change to or away
- * from us. If we have the write lock, we are the owner
- * and it'll never change.
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
- if (rw == BTRFS_WRITE_LOCK) {
- if (atomic_read(&eb->blocking_writers) == 0) {
- WARN_ON(atomic_read(&eb->spinning_writers) != 1);
- atomic_dec(&eb->spinning_writers);
- btrfs_assert_tree_locked(eb);
- atomic_inc(&eb->blocking_writers);
- write_unlock(&eb->lock);
- }
- } else if (rw == BTRFS_READ_LOCK) {
- btrfs_assert_tree_read_locked(eb);
- atomic_inc(&eb->blocking_readers);
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
- read_unlock(&eb->lock);
+ btrfs_assert_tree_read_locked(eb);
+ atomic_inc(&eb->blocking_readers);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ read_unlock(&eb->lock);
+}
+
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ if (atomic_read(&eb->blocking_writers) == 0) {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ btrfs_assert_tree_locked(eb);
+ atomic_inc(&eb->blocking_writers);
+ write_unlock(&eb->lock);
}
}
-/*
- * if we currently have a blocking lock, take the spinlock
- * and drop our blocking count
- */
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+ read_lock(&eb->lock);
+ atomic_inc(&eb->spinning_readers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
+}
+
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
{
/*
* no lock is required. The lock owner may change if
@@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
-
- if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_writers) != 1);
- write_lock(&eb->lock);
- WARN_ON(atomic_read(&eb->spinning_writers));
- atomic_inc(&eb->spinning_writers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_writers))
- cond_wake_up_nomb(&eb->write_lock_wq);
- } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_readers) == 0);
- read_lock(&eb->lock);
- atomic_inc(&eb->spinning_readers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
- }
+ BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+ write_lock(&eb->lock);
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_writers))
+ cond_wake_up_nomb(&eb->write_lock_wq);
}
/*
@@ -232,16 +237,9 @@ again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
write_lock(&eb->lock);
- if (atomic_read(&eb->blocking_readers)) {
+ if (atomic_read(&eb->blocking_readers) ||
+ atomic_read(&eb->blocking_writers)) {
write_unlock(&eb->lock);
- wait_event(eb->read_lock_wq,
- atomic_read(&eb->blocking_readers) == 0);
- goto again;
- }
- if (atomic_read(&eb->blocking_writers)) {
- write_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- atomic_read(&eb->blocking_writers) == 0);
goto again;
}
WARN_ON(atomic_read(&eb->spinning_writers));
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 29135def468e..595014f64830 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
BUG();
}
-static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
-}
-
-static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
-}
#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 90639140439f..579d53ae256f 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -61,6 +61,28 @@ struct workspace {
struct list_head list;
};
+static struct workspace_manager wsm;
+
+static void lzo_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
+}
+
+static void lzo_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *lzo_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&wsm, level);
+}
+
+static void lzo_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *lzo_alloc_workspace(void)
+static struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -485,11 +507,16 @@ out:
return ret;
}
-static void lzo_set_level(struct list_head *ws, unsigned int type)
+static unsigned int lzo_set_level(unsigned int level)
{
+ return 0;
}
const struct btrfs_compress_op btrfs_lzo_compress = {
+ .init_workspace_manager = lzo_init_workspace_manager,
+ .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
+ .get_workspace = lzo_get_workspace,
+ .put_workspace = lzo_put_workspace,
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e473a998219..eb680b715dd6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -894,6 +894,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
if (fs_info->quota_root)
goto out;
+ fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -909,13 +915,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out;
}
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
- ret = -ENOMEM;
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
/*
* initially create the quota tree
*/
@@ -1546,12 +1545,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ } else {
+ if (record->data_rsv && !entry->data_rsv) {
+ entry->data_rsv = record->data_rsv;
+ entry->data_rsv_refroot =
+ record->data_rsv_refroot;
+ }
return 1;
+ }
}
rb_link_node(&record->node, parent_node, p);
@@ -1597,7 +1602,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
- record = kmalloc(sizeof(*record), gfp_flag);
+ record = kzalloc(sizeof(*record), gfp_flag);
if (!record)
return -ENOMEM;
@@ -1832,7 +1837,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
}
@@ -1973,7 +1978,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
need_cleanup = true;
}
@@ -2017,86 +2022,30 @@ out:
return ret;
}
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot)
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct extent_buffer *src_eb,
+ struct extent_buffer *dst_eb,
+ u64 last_snapshot, bool trace_leaf)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *dst_path = NULL;
- struct btrfs_key first_key;
- struct extent_buffer *src_eb = NULL;
- struct extent_buffer *dst_eb = NULL;
- bool trace_leaf = false;
- u64 child_gen;
- u64 child_bytenr;
int level;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
- /* Check parameter order */
- if (btrfs_node_ptr_generation(src_parent, src_slot) >
- btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ /* Wrong parameter order */
+ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
- btrfs_node_ptr_generation(src_parent, src_slot),
- btrfs_node_ptr_generation(dst_parent, dst_slot));
+ btrfs_header_generation(src_eb),
+ btrfs_header_generation(dst_eb));
return -EUCLEAN;
}
- /*
- * Only trace leaf if we're relocating data block groups, this could
- * reduce tons of data extents tracing for meta/sys bg relocation.
- */
- if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
- trace_leaf = true;
- /* Read out real @src_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
- child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
- btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
-
- src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(src_parent) - 1, &first_key);
- if (IS_ERR(src_eb)) {
- ret = PTR_ERR(src_eb);
- goto out;
- }
-
- /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
- child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
- btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
-
- dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(dst_parent) - 1, &first_key);
- if (IS_ERR(dst_eb)) {
- ret = PTR_ERR(dst_eb);
- goto out;
- }
-
if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
- ret = -EINVAL;
+ ret = -EIO;
goto out;
}
@@ -2106,14 +2055,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = -ENOMEM;
goto out;
}
-
/* For dst_path */
extent_buffer_get(dst_eb);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
- /* Do the generation-aware breadth-first search */
+ /* Do the generation aware breadth-first search */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
level, last_snapshot, trace_leaf);
if (ret < 0)
@@ -2121,8 +2069,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = 0;
out:
- free_extent_buffer(src_eb);
- free_extent_buffer(dst_eb);
btrfs_free_path(dst_path);
if (ret < 0)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2207,7 +2153,7 @@ walk_down:
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -2576,6 +2522,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
goto cleanup;
}
+ /* Free the reserved data space */
+ btrfs_qgroup_free_refroot(fs_info,
+ record->data_rsv_refroot,
+ record->data_rsv,
+ BTRFS_QGROUP_RSV_DATA);
/*
* Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
@@ -2842,16 +2793,15 @@ out:
/*
* Two limits to commit transaction in advance.
*
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
* For SIZE, it will be in byte unit as threshold.
*/
-#define QGROUP_PERTRANS_RATIO 32
-#define QGROUP_PERTRANS_SIZE SZ_32M
+#define QGROUP_FREE_RATIO 32
+#define QGROUP_FREE_SIZE SZ_32M
static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 limit;
+ u64 free;
u64 threshold;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2870,20 +2820,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
*/
if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
- limit = qg->max_excl;
- else
- limit = qg->max_rfer;
- threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
- qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
- QGROUP_PERTRANS_RATIO;
- threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+ if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+ threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ } else {
+ free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+ threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ }
/*
* Use transaction_kthread to commit transaction, so we no
* longer need to bother nested transaction nor lock context.
*/
- if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+ if (free < threshold)
btrfs_commit_transaction_locksafe(fs_info);
}
@@ -2959,7 +2910,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
@@ -3026,7 +2976,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
@@ -3783,3 +3732,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
extent_changeset_release(&changeset);
}
+
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+ int i;
+
+ spin_lock_init(&swapped_blocks->lock);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ swapped_blocks->blocks[i] = RB_ROOT;
+ swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+ int i;
+
+ swapped_blocks = &root->swapped_blocks;
+
+ spin_lock(&swapped_blocks->lock);
+ if (!swapped_blocks->swapped)
+ goto out;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
+ struct btrfs_qgroup_swapped_block *entry;
+ struct btrfs_qgroup_swapped_block *next;
+
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+ node)
+ kfree(entry);
+ swapped_blocks->blocks[i] = RB_ROOT;
+ }
+ swapped_blocks->swapped = false;
+out:
+ spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add subtree roots record into @subvol_root.
+ *
+ * @subvol_root: tree root of the subvolume tree get swapped
+ * @bg: block group under balance
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ * BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot: last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct rb_node **cur;
+ struct rb_node *parent = NULL;
+ int level = btrfs_header_level(subvol_parent) - 1;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+ __func__,
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+ return -EUCLEAN;
+ }
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * @reloc_parent/slot is still before swap, while @block is going to
+ * record the bytenr after swap, so we do the swap here.
+ */
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+ reloc_slot);
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+ subvol_slot);
+ block->last_snapshot = last_snapshot;
+ block->level = level;
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ block->trace_leaf = true;
+ else
+ block->trace_leaf = false;
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+ /* Insert @block into @blocks */
+ spin_lock(&blocks->lock);
+ cur = &blocks->blocks[level].rb_node;
+ while (*cur) {
+ struct btrfs_qgroup_swapped_block *entry;
+
+ parent = *cur;
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+ node);
+
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
+ cur = &(*cur)->rb_left;
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+ cur = &(*cur)->rb_right;
+ } else {
+ if (entry->subvol_generation !=
+ block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation !=
+ block->reloc_generation) {
+ /*
+ * Duplicated but mismatch entry found.
+ * Shouldn't happen.
+ *
+ * Marking qgroup inconsistent should be enough
+ * for end users.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EEXIST;
+ }
+ kfree(block);
+ goto out_unlock;
+ }
+ }
+ rb_link_node(&block->node, parent, cur);
+ rb_insert_color(&block->node, &blocks->blocks[level]);
+ blocks->swapped = true;
+out_unlock:
+ spin_unlock(&blocks->lock);
+out:
+ if (ret < 0)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *subvol_eb)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct extent_buffer *reloc_eb = NULL;
+ struct rb_node *node;
+ bool found = false;
+ bool swapped = false;
+ int level = btrfs_header_level(subvol_eb);
+ int ret = 0;
+ int i;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ return 0;
+
+ spin_lock(&blocks->lock);
+ if (!blocks->swapped) {
+ spin_unlock(&blocks->lock);
+ return 0;
+ }
+ node = blocks->blocks[level].rb_node;
+
+ while (node) {
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+ if (block->subvol_bytenr < subvol_eb->start) {
+ node = node->rb_left;
+ } else if (block->subvol_bytenr > subvol_eb->start) {
+ node = node->rb_right;
+ } else {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ spin_unlock(&blocks->lock);
+ goto out;
+ }
+ /* Found one, remove it from @blocks first and update blocks->swapped */
+ rb_erase(&block->node, &blocks->blocks[level]);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+ swapped = true;
+ break;
+ }
+ }
+ blocks->swapped = swapped;
+ spin_unlock(&blocks->lock);
+
+ /* Read out reloc subtree root */
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+ block->reloc_generation, block->level,
+ &block->first_key);
+ if (IS_ERR(reloc_eb)) {
+ ret = PTR_ERR(reloc_eb);
+ reloc_eb = NULL;
+ goto free_out;
+ }
+ if (!extent_buffer_uptodate(reloc_eb)) {
+ ret = -EIO;
+ goto free_out;
+ }
+
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+ block->last_snapshot, block->trace_leaf);
+free_out:
+ kfree(block);
+ free_extent_buffer(reloc_eb);
+out:
+ if (ret < 0) {
+ btrfs_err_rl(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 20c6bd5fa701..46ba7bd2961c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -38,6 +40,66 @@
*/
/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap subtree of subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However since balance has ensured that both subtrees are containing the
+ * same contents and have the same tree structures, such swap won't cause
+ * qgroup number change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use a delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block get swapped.
+ *
+ * During subtree swap:
+ * O = Old tree blocks
+ * N = New tree blocks
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * NA OB OA OB
+ * / | | \ / | | \
+ * NC ND OE OF OC OD OE OF
+ *
+ * In this case, NA and OA are going to be swapped, record (NA, OA) into
+ * subvolume tree X.
+ *
+ * 2) After subtree swap.
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * OA OB NA OB
+ * / | | \ / | | \
+ * OC OD OE OF NC ND OE OF
+ *
+ * 3a) COW happens for OB
+ * If we are going to COW tree block OB, we check OB's bytenr against
+ * tree X's swapped_blocks structure.
+ * If it doesn't fit any, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ * Then we do subtree scan on both subtrees OA and NA.
+ * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ * Then no matter what we do to subvolume tree X, qgroup numbers will
+ * still be correct.
+ * Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4) Transaction commit
+ * Any record in X's swapped_blocks gets removed, since there is no
+ * modification to the swapped subtrees, no need to trigger heavy qgroup
+ * subtree rescan for them.
+ */
+
+/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
*/
@@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record {
struct rb_node node;
u64 bytenr;
u64 num_bytes;
+
+ /*
+ * For qgroup reserved data space freeing.
+ *
+ * @data_rsv_refroot and @data_rsv will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * transaction commit time.
+ */
+ u32 data_rsv; /* reserved data space needs to be freed */
+ u64 data_rsv_refroot; /* which root the reserved data belongs to */
struct ulist *old_roots;
};
+struct btrfs_qgroup_swapped_block {
+ struct rb_node node;
+
+ int level;
+ bool trace_leaf;
+
+ /* bytenr/generation of the tree block in subvolume tree after swap */
+ u64 subvol_bytenr;
+ u64 subvol_generation;
+
+ /* bytenr/generation of the tree block in reloc tree after swap */
+ u64 reloc_bytenr;
+ u64 reloc_generation;
+
+ u64 last_snapshot;
+ struct btrfs_key first_key;
+};
+
/*
* Qgroup reservation types:
*
@@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
-
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes)
-{
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return;
- trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
- BTRFS_QGROUP_RSV_DATA);
-}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *eb);
+
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index e74455eb42f9..1869ba8e5981 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
SetPageUptodate(bvec->bv_page);
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index c3557c12656b..d09b6cdb785a 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
return -EIO;
}
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
return -ENOMEM;
eb = btrfs_read_lock_root_node(fs_info->extent_root);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 272b287f8cf0..ddf028509931 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -162,6 +162,8 @@ struct reloc_control {
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* list of subvolume trees that get relocated */
+ struct list_head dirty_subvol_roots;
/* size of metadata reservation for merging reloc trees */
u64 merging_rsv_size;
/* size of relocated tree nodes */
@@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
- if (!root->reloc_root)
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+ !root->reloc_root)
goto out;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
- root->reloc_root = NULL;
+ set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
__del_reloc_root(reloc_root);
}
@@ -1773,7 +1777,7 @@ again:
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@@ -1786,7 +1790,7 @@ again:
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@@ -1802,6 +1806,8 @@ again:
BUG_ON(level < lowest_level);
ret = btrfs_bin_search(parent, &key, level, &slot);
+ if (ret < 0)
+ break;
if (ret && slot > 0)
slot--;
@@ -1852,7 +1858,7 @@ again:
slot, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1885,15 +1891,18 @@ again:
* If not traced, we will leak data numbers
* 2) Fs subtree
* If not traced, we will double count old data
- * and tree block numbers, if current trans doesn't free
- * data reloc tree inode.
+ *
+ * We don't scan the subtree right now, but only record
+ * the swapped tree blocks.
+ * The real subtree rescan is delayed until we have new
+ * CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
- parent, slot, path->nodes[level],
- path->slots[level], last_snapshot);
+ ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ rc->block_group, parent, slot,
+ path->nodes[level], path->slots[level],
+ last_snapshot);
if (ret < 0)
break;
-
/*
* swap blocks in fs tree and reloc tree.
*/
@@ -2121,6 +2130,58 @@ static int find_next_key(struct btrfs_path *path, int level,
}
/*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root = root->reloc_root;
+ struct btrfs_root_item *reloc_root_item;
+
+ /* @root must be a subvolume tree root with a valid reloc tree */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(reloc_root);
+
+ reloc_root_item = &reloc_root->root_item;
+ memset(&reloc_root_item->drop_progress, 0,
+ sizeof(reloc_root_item->drop_progress));
+ reloc_root_item->drop_level = 0;
+ btrfs_set_root_refs(reloc_root_item, 0);
+ btrfs_update_reloc_root(trans, root);
+
+ if (list_empty(&root->reloc_dirty_list)) {
+ btrfs_grab_fs_root(root);
+ list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+ }
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *next;
+ int ret = 0;
+
+ list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+ reloc_dirty_list) {
+ struct btrfs_root *reloc_root = root->reloc_root;
+
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ list_del_init(&root->reloc_dirty_list);
+ root->reloc_root = NULL;
+ if (reloc_root) {
+ int ret2;
+
+ ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
+ }
+ btrfs_put_fs_root(root);
+ }
+ return ret;
+}
+
+/*
* merge the relocated tree blocks in reloc tree with corresponding
* fs tree.
*/
@@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
struct btrfs_trans_handle *trans = NULL;
@@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
- if (err == 0) {
- memset(&root_item->drop_progress, 0,
- sizeof(root_item->drop_progress));
- root_item->drop_level = 0;
- btrfs_set_root_refs(root_item, 0);
- btrfs_update_reloc_root(trans, root);
- }
+ if (err == 0)
+ insert_dirty_subvol(trans, rc, root);
if (trans)
btrfs_end_transaction_throttle(trans);
@@ -2410,14 +2465,6 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
-
- ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
- if (ret < 0) {
- if (list_empty(&reloc_root->root_list))
- list_add_tail(&reloc_root->root_list,
- &reloc_roots);
- goto out;
- }
}
if (found) {
@@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key,
upper->level, &slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
@@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
&slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
}
@@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
@@ -4079,6 +4134,9 @@ restart:
goto out_free;
}
btrfs_commit_transaction(trans);
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
@@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void)
return NULL;
INIT_LIST_HEAD(&rc->reloc_roots);
+ INIT_LIST_HEAD(&rc->dirty_subvol_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(&rc->processed_blocks, NULL);
@@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_free;
}
err = btrfs_commit_transaction(trans);
+
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
kfree(rc);
out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 65bda0682928..893d12fbfda0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
uuid_le uuid;
- int len;
+ u32 len;
int need_reset = 0;
len = btrfs_item_size_nr(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
- min_t(int, len, (int)sizeof(*item)));
+ min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
need_reset = 1;
if (!need_reset && btrfs_root_generation(item)
@@ -263,8 +263,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
if (root) {
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
&root->state));
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
+ }
continue;
}
@@ -310,8 +312,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
break;
}
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
+ }
}
btrfs_free_path(path);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dcd36d7b849..a99588536c79 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
+ INIT_LIST_HEAD(&sctx->csum_list);
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
@@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
- INIT_LIST_HEAD(&sctx->csum_list);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
@@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
- if (fs_info->scrub_workers_refcnt == 0) {
+ lockdep_assert_held(&fs_info->scrub_lock);
+
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL);
fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
flags, is_dev_replace ? 1 : max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
+ ASSERT(fs_info->scrub_wr_completion_workers == NULL);
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
+ ASSERT(fs_info->scrub_parity_workers == NULL);
fs_info->scrub_parity_workers =
btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
+
+ refcount_set(&fs_info->scrub_workers_refcnt, 1);
+ } else {
+ refcount_inc(&fs_info->scrub_workers_refcnt);
}
- ++fs_info->scrub_workers_refcnt;
return 0;
fail_scrub_parity_workers:
@@ -3770,16 +3778,6 @@ fail_scrub_workers:
return -ENOMEM;
}
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
- if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_destroy_workqueue(fs_info->scrub_workers);
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
- btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
- }
- WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
@@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return PTR_ERR(sctx);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
* kick off writing super in log tree sync.
@@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));
+ if (!is_dev_replace)
+ btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+ ret ? "not finished" : "finished", devid, ret);
+
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- scrub_workers_put(fs_info);
+ if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ }
mutex_unlock(&fs_info->scrub_lock);
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
scrub_put_ctx(sctx);
return ret;
@@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0a3f122dd61f..120e4340792a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
if (token != Opt_compress &&
token != Opt_compress_force)
info->compress_level =
- btrfs_compress_str2level(args[0].from);
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZLIB,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
no_compress = 0;
- } else if (strcmp(args[0].from, "zstd") == 0) {
+ } else if (strncmp(args[0].from, "zstd", 4) == 0) {
compress_type = "zstd";
info->compress_type = BTRFS_COMPRESS_ZSTD;
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZSTD,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -2190,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
+ case BTRFS_IOC_FORGET_DEV:
+ ret = btrfs_forget_devices(vol->name);
+ break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4ec2b660d014..acdad6d658f5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
+ btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */
@@ -845,8 +846,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
@@ -1532,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- btrfs_set_lock_blocking(old);
+ btrfs_set_lock_blocking_write(old);
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
@@ -1943,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->delayed_refs.flushing = 1;
smp_wmb();
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3c0987ab587d..5f9e2dd413af 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(root_node);
+ btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ac232b3d6d7e..f06454a55e00 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2
+#define LOG_OTHER_INODE_ALL 3
/*
* directory trouble cases
@@ -1330,6 +1331,67 @@ out:
return ret;
}
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct inode *dir, struct inode *inode, const char *name,
+ int namelen, u64 ref_index)
+{
+ struct btrfs_dir_item *dir_item;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct inode *other_inode = NULL;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_item = btrfs_lookup_dir_item(NULL, root, path,
+ btrfs_ino(BTRFS_I(dir)),
+ name, namelen, 0);
+ if (!dir_item) {
+ btrfs_release_path(path);
+ goto add_link;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ goto out;
+ }
+
+ /*
+ * Our inode's dentry collides with the dentry of another inode which is
+ * in the log but not yet processed since it has a higher inode number.
+ * So delete that other dentry.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+ btrfs_release_path(path);
+ other_inode = read_one_inode(root, key.objectid);
+ if (!other_inode) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
+ if (ret)
+ goto out;
+ /*
+ * If we dropped the link count to 0, bump it so that later the iput()
+ * on the inode will not free it. We will fixup the link count later.
+ */
+ if (other_inode->i_nlink == 0)
+ inc_nlink(other_inode);
+
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto out;
+add_link:
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
+out:
+ iput(other_inode);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen, 0, ref_index);
+ ret = add_link(trans, root, dir, inode, name, namelen,
+ ref_index);
if (ret)
goto out;
@@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -3706,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
found_key.type = 0;
ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
&start_slot);
+ if (ret < 0)
+ break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
@@ -4717,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
struct btrfs_inode *inode,
- u64 *other_ino)
+ u64 *other_ino, u64 *other_parent)
{
int ret;
struct btrfs_path *search_path;
@@ -4780,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
- ret = 1;
- *other_ino = di_key.objectid;
+ if (di_key.objectid != key->objectid) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ *other_parent = parent;
+ } else {
+ ret = 0;
+ }
} else {
ret = -EAGAIN;
}
@@ -4801,6 +4869,144 @@ out:
return ret;
}
+struct btrfs_ino_list {
+ u64 ino;
+ u64 parent;
+ struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ u64 ino, u64 parent)
+{
+ struct btrfs_ino_list *ino_elem;
+ LIST_HEAD(inode_list);
+ int ret = 0;
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+
+ while (!list_empty(&inode_list)) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+ list);
+ ino = ino_elem->ino;
+ parent = ino_elem->parent;
+ list_del(&ino_elem->list);
+ kfree(ino_elem);
+ if (ret)
+ continue;
+
+ btrfs_release_path(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ /*
+ * If the other inode that had a conflicting dir entry was
+ * deleted in the current transaction, we need to log its parent
+ * directory.
+ */
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret == -ENOENT) {
+ key.objectid = parent;
+ inode = btrfs_iget(fs_info->sb, &key, root,
+ NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ } else {
+ ret = btrfs_log_inode(trans, root,
+ BTRFS_I(inode),
+ LOG_OTHER_INODE_ALL,
+ 0, LLONG_MAX, ctx);
+ iput(inode);
+ }
+ }
+ continue;
+ }
+ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
+ * well because during a rename we pin the log and update the
+ * log with the new name before we unpin it.
+ */
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ if (ret) {
+ iput(inode);
+ continue;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ iput(inode);
+ continue;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_check_ref_name_override(leaf, slot, &key,
+ BTRFS_I(inode), &other_ino,
+ &other_parent);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem) {
+ ret = -ENOMEM;
+ break;
+ }
+ ino_elem->ino = other_ino;
+ ino_elem->parent = other_parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+ ret = 0;
+ }
+ path->slots[0]++;
+ }
+ iput(inode);
+ }
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4840,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
+ bool recursive_logging = false;
path = btrfs_alloc_path();
if (!path)
@@ -4885,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return ret;
}
- if (inode_only == LOG_OTHER_INODE) {
- inode_only = LOG_INODE_EXISTS;
+ if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+ recursive_logging = true;
+ if (inode_only == LOG_OTHER_INODE)
+ inode_only = LOG_INODE_EXISTS;
+ else
+ inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex);
@@ -4981,20 +5192,19 @@ again:
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid) {
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
+ u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], &min_key, inode,
- &other_ino);
+ &other_ino, &other_parent);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- struct btrfs_key inode_key;
- struct inode *other_inode;
-
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -5010,43 +5220,13 @@ again:
goto out_unlock;
}
ins_nr = 0;
- btrfs_release_path(path);
- inode_key.objectid = other_ino;
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
- other_inode = btrfs_iget(fs_info->sb,
- &inode_key, root,
- NULL);
- /*
- * If the other inode that had a conflicting dir
- * entry was deleted in the current transaction,
- * we don't need to do more work nor fallback to
- * a transaction commit.
- */
- if (other_inode == ERR_PTR(-ENOENT)) {
- goto next_key;
- } else if (IS_ERR(other_inode)) {
- err = PTR_ERR(other_inode);
- goto out_unlock;
- }
- /*
- * We are safe logging the other inode without
- * acquiring its i_mutex as long as we log with
- * the LOG_INODE_EXISTS mode. We're safe against
- * concurrent renames of the other inode as well
- * because during a rename we pin the log and
- * update the log with the new name before we
- * unpin it.
- */
- err = btrfs_log_inode(trans, root,
- BTRFS_I(other_inode),
- LOG_OTHER_INODE, 0, LLONG_MAX,
- ctx);
- iput(other_inode);
+
+ err = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
if (err)
goto out_unlock;
- else
- goto next_key;
+ btrfs_release_path(path);
+ goto next_key;
}
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 15561926ab32..9024eee889b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void)
return dev;
}
-/*
- * Find a device specified by @devid or @uuid in the list of @fs_devices, or
- * return NULL.
- *
- * If devid and uuid are both specified, the match must be exact, otherwise
- * only devid is used.
- */
-static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, const u8 *uuid)
-{
- struct btrfs_device *dev;
-
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
- if (dev->devid == devid &&
- (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
- return dev;
- }
- }
- return NULL;
-}
-
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+static bool device_path_matched(const char *path, struct btrfs_device *device)
+{
+ int found;
+
+ rcu_read_lock();
+ found = strcmp(rcu_str_deref(device->name), path);
+ rcu_read_unlock();
+
+ return found == 0;
+}
+
/*
* Search and remove all stale (devices which are not mounted) devices.
* When both inputs are NULL, it will search and release all stale devices.
@@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work)
* matching this path only.
* skip_dev: Optional. Will skip this device when searching for the stale
* devices.
+ * Return: 0 for success or if @path is NULL.
+ * -EBUSY if @path is a mounted device.
+ * -ENOENT if @path does not match any device in the list.
*/
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
+ int ret = 0;
+
+ if (path)
+ ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
- mutex_lock(&fs_devices->device_list_mutex);
- if (fs_devices->opened) {
- mutex_unlock(&fs_devices->device_list_mutex);
- continue;
- }
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp_device,
&fs_devices->devices, dev_list) {
- int not_found = 0;
-
if (skip_device && skip_device == device)
continue;
if (path && !device->name)
continue;
-
- rcu_read_lock();
- if (path)
- not_found = strcmp(rcu_str_deref(device->name),
- path);
- rcu_read_unlock();
- if (not_found)
+ if (path && !device_path_matched(path, device))
continue;
+ if (fs_devices->opened) {
+ /* for an already deleted device return 0 */
+ if (path && ret != 0)
+ ret = -EBUSY;
+ break;
+ }
/* delete the stale device */
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
+ ret = 0;
if (fs_devices->num_devices == 0)
break;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
if (fs_devices->num_devices == 0) {
btrfs_sysfs_remove_fsid(fs_devices);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
+
+ return ret;
}
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@@ -968,8 +963,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
mutex_lock(&fs_devices->device_list_mutex);
- device = find_device(fs_devices, devid,
- disk_super->dev_item.uuid);
+ device = btrfs_find_device(fs_devices, devid,
+ disk_super->dev_item.uuid, NULL, false);
/*
* If this disk has been pulled into an fs devices created by
@@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
- /* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
@@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
return 0;
}
+int btrfs_forget_devices(const char *path)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ mutex_unlock(&uuid_mutex);
+
+ return ret;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path(
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info, devid, dev_uuid,
- disk_super->metadata_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->metadata_uuid, true);
else
- device = btrfs_find_device(fs_info, devid,
- dev_uuid, disk_super->fsid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->fsid, true);
brelse(bh);
if (!device)
@@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path(
return device;
}
-static struct btrfs_device *btrfs_find_device_missing_or_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
-{
- struct btrfs_device *device = NULL;
- if (strcmp(device_path, "missing") == 0) {
- struct list_head *devices;
- struct btrfs_device *tmp;
-
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(tmp, devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &tmp->dev_state) && !tmp->bdev) {
- device = tmp;
- break;
- }
- }
-
- if (!device)
- return ERR_PTR(-ENOENT);
- } else {
- device = btrfs_find_device_by_path(fs_info, device_path);
- }
-
- return device;
-}
-
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
struct btrfs_device *btrfs_find_device_by_devspec(
- struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
+ struct btrfs_fs_info *fs_info, u64 devid,
+ const char *device_path)
{
struct btrfs_device *device;
if (devid) {
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+ NULL, true);
if (!device)
return ERR_PTR(-ENOENT);
- } else {
- if (!devpath || !devpath[0])
- return ERR_PTR(-EINVAL);
- device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
+ return device;
}
- return device;
+
+ if (!device_path || !device_path[0])
+ return ERR_PTR(-EINVAL);
+
+ if (strcmp(device_path, "missing") == 0) {
+ /* Find first missing device */
+ list_for_each_entry(device, &fs_info->fs_devices->devices,
+ dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) && !device->bdev)
+ return device;
+ }
+ return ERR_PTR(-ENOENT);
+ }
+
+ return btrfs_find_device_by_path(fs_info, device_path);
}
/*
@@ -2563,7 +2556,8 @@ next_slot:
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
return BLK_STS_OK;
}
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid)
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ *
+ * If @seed is true, traverse through the seed devices.
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid,
+ bool seed)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *cur_devices;
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
+ while (fs_devices) {
if (!fsid ||
- !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- device = find_device(cur_devices, devid, uuid);
- if (device)
- return device;
+ !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices,
+ dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
}
- cur_devices = cur_devices->seed;
+ if (seed)
+ fs_devices = fs_devices->seed;
+ else
+ return NULL;
}
return NULL;
}
@@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
}
if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
(type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
(type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != 1)) {
btrfs_err(fs_info,
@@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info, devid,
- uuid, NULL);
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
+ devid, uuid, NULL, true);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
+ true);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7819,7 +7830,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
@@ -7828,7 +7839,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = find_device(fs_info->fs_devices->seed, devid, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
+ NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ed806649a473..3ad9d58d1b66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
@@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid, bool seed);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 970ff3e35bb3..b86b7ad6b900 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -27,6 +27,33 @@ struct workspace {
int level;
};
+static struct workspace_manager wsm;
+
+static void zlib_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
+}
+
+static void zlib_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *zlib_get_workspace(unsigned int level)
+{
+ struct list_head *ws = btrfs_get_workspace(&wsm, level);
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ workspace->level = level;
+
+ return ws;
+}
+
+static void zlib_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zlib_alloc_workspace(void)
+static struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->level = level;
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -390,18 +418,19 @@ next:
return ret;
}
-static void zlib_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zlib_set_level(unsigned int level)
{
- struct workspace *workspace = list_entry(ws, struct workspace, list);
- unsigned level = (type & 0xF0) >> 4;
-
- if (level > 9)
- level = 9;
+ if (!level)
+ return BTRFS_ZLIB_DEFAULT_LEVEL;
- workspace->level = level > 0 ? level : 3;
+ return min_t(unsigned int, level, 9);
}
const struct btrfs_compress_op btrfs_zlib_compress = {
+ .init_workspace_manager = zlib_init_workspace_manager,
+ .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
+ .get_workspace = zlib_get_workspace,
+ .put_workspace = zlib_put_workspace,
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index af6ec59972f5..6b9e29d050f3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -6,25 +6,31 @@
*/
#include <linux/bio.h>
+#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/pagemap.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include "compression.h"
+#include "ctree.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MAX_LEVEL 15
+/* 307s to avoid pathologically clashing with transaction commit */
+#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+ size_t src_len)
{
- ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
- src_len, 0);
+ ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -36,11 +42,292 @@ struct workspace {
void *mem;
size_t size;
char *buf;
+ unsigned int level;
+ unsigned int req_level;
+ unsigned long last_used; /* jiffies */
struct list_head list;
+ struct list_head lru_list;
ZSTD_inBuffer in_buf;
ZSTD_outBuffer out_buf;
};
+/*
+ * Zstd Workspace Management
+ *
+ * Zstd workspaces have different memory requirements depending on the level.
+ * The zstd workspaces are managed by having individual lists for each level
+ * and a global lru. Forward progress is maintained by protecting a max level
+ * workspace.
+ *
+ * Getting a workspace is done by using the bitmap to identify the levels that
+ * have available workspaces and scans up. This lets us recycle higher level
+ * workspaces because of the monotonic memory guarantee. A workspace's
+ * last_used is only updated if it is being used by the corresponding memory
+ * level. Putting a workspace involves adding it back to the appropriate places
+ * and adding it back to the lru if necessary.
+ *
+ * A timer is used to reclaim workspaces if they have not been used for
+ * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
+ * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
+ */
+
+struct zstd_workspace_manager {
+ const struct btrfs_compress_op *ops;
+ spinlock_t lock;
+ struct list_head lru_list;
+ struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
+ unsigned long active_map;
+ wait_queue_head_t wait;
+ struct timer_list timer;
+};
+
+static struct zstd_workspace_manager wsm;
+
+static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
+
+static inline struct workspace *list_to_workspace(struct list_head *list)
+{
+ return container_of(list, struct workspace, list);
+}
+
+/*
+ * zstd_reclaim_timer_fn - reclaim timer
+ * @t: timer
+ *
+ * This scans the lru_list and attempts to reclaim any workspace that hasn't
+ * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ */
+static void zstd_reclaim_timer_fn(struct timer_list *timer)
+{
+ unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
+ struct list_head *pos, *next;
+
+ spin_lock(&wsm.lock);
+
+ if (list_empty(&wsm.lru_list)) {
+ spin_unlock(&wsm.lock);
+ return;
+ }
+
+ list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ struct workspace *victim = container_of(pos, struct workspace,
+ lru_list);
+ unsigned int level;
+
+ if (time_after(victim->last_used, reclaim_threshold))
+ break;
+
+ /* workspace is in use */
+ if (victim->req_level)
+ continue;
+
+ level = victim->level;
+ list_del(&victim->lru_list);
+ list_del(&victim->list);
+ wsm.ops->free_workspace(&victim->list);
+
+ if (list_empty(&wsm.idle_ws[level - 1]))
+ clear_bit(level - 1, &wsm.active_map);
+
+ }
+
+ if (!list_empty(&wsm.lru_list))
+ mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+
+ spin_unlock(&wsm.lock);
+}
+
+/*
+ * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ *
+ * It is possible based on the level configurations that a higher level
+ * workspace uses less memory than a lower level workspace. In order to reuse
+ * workspaces, this must be made a monotonic relationship. This precomputes
+ * the required memory for each level and enforces the monotonicity between
+ * level and memory required.
+ */
+static void zstd_calc_ws_mem_sizes(void)
+{
+ size_t max_size = 0;
+ unsigned int level;
+
+ for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ ZSTD_parameters params =
+ zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
+ size_t level_size =
+ max_t(size_t,
+ ZSTD_CStreamWorkspaceBound(params.cParams),
+ ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+
+ max_size = max_t(size_t, max_size, level_size);
+ zstd_ws_mem_sizes[level - 1] = max_size;
+ }
+}
+
+static void zstd_init_workspace_manager(void)
+{
+ struct list_head *ws;
+ int i;
+
+ zstd_calc_ws_mem_sizes();
+
+ wsm.ops = &btrfs_zstd_compress;
+ spin_lock_init(&wsm.lock);
+ init_waitqueue_head(&wsm.wait);
+ timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
+
+ INIT_LIST_HEAD(&wsm.lru_list);
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&wsm.idle_ws[i]);
+
+ ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ if (IS_ERR(ws)) {
+ pr_warn(
+ "BTRFS: cannot preallocate zstd compression workspace\n");
+ } else {
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
+ list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ }
+}
+
+static void zstd_cleanup_workspace_manager(void)
+{
+ struct workspace *workspace;
+ int i;
+
+ spin_lock(&wsm.lock);
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&wsm.idle_ws[i])) {
+ workspace = container_of(wsm.idle_ws[i].next,
+ struct workspace, list);
+ list_del(&workspace->list);
+ list_del(&workspace->lru_list);
+ wsm.ops->free_workspace(&workspace->list);
+ }
+ }
+ spin_unlock(&wsm.lock);
+
+ del_timer_sync(&wsm.timer);
+}
+
+/*
+ * zstd_find_workspace - find workspace
+ * @level: compression level
+ *
+ * This iterates over the set bits in the active_map beginning at the requested
+ * compression level. This lets us utilize already allocated workspaces before
+ * allocating a new one. If the workspace is of a larger size, it is used, but
+ * the place in the lru_list and last_used times are not updated. This is to
+ * offer the opportunity to reclaim the workspace in favor of allocating an
+ * appropriately sized one in the future.
+ */
+static struct list_head *zstd_find_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ struct workspace *workspace;
+ int i = level - 1;
+
+ spin_lock(&wsm.lock);
+ for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&wsm.idle_ws[i])) {
+ ws = wsm.idle_ws[i].next;
+ workspace = list_to_workspace(ws);
+ list_del_init(ws);
+ /* keep its place if it's a lower level using this */
+ workspace->req_level = level;
+ if (level == workspace->level)
+ list_del(&workspace->lru_list);
+ if (list_empty(&wsm.idle_ws[i]))
+ clear_bit(i, &wsm.active_map);
+ spin_unlock(&wsm.lock);
+ return ws;
+ }
+ }
+ spin_unlock(&wsm.lock);
+
+ return NULL;
+}
+
+/*
+ * zstd_get_workspace - zstd's get_workspace
+ * @level: compression level
+ *
+ * If @level is 0, then any compression level can be used. Therefore, we begin
+ * scanning from 1. We first scan through possible workspaces and then after
+ * attempt to allocate a new workspace. If we fail to allocate one due to
+ * memory pressure, go to sleep waiting for the max level workspace to free up.
+ */
+static struct list_head *zstd_get_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ unsigned int nofs_flag;
+
+ /* level == 0 means we can use any workspace */
+ if (!level)
+ level = 1;
+
+again:
+ ws = zstd_find_workspace(level);
+ if (ws)
+ return ws;
+
+ nofs_flag = memalloc_nofs_save();
+ ws = wsm.ops->alloc_workspace(level);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (IS_ERR(ws)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&wsm.wait, &wait);
+
+ goto again;
+ }
+
+ return ws;
+}
+
+/*
+ * zstd_put_workspace - zstd put_workspace
+ * @ws: list_head for the workspace
+ *
+ * When putting back a workspace, we only need to update the LRU if we are of
+ * the requested compression level. Here is where we continue to protect the
+ * max level workspace or update last_used accordingly. If the reclaim timer
+ * isn't set, it is also set here. Only the max level workspace tries and wakes
+ * up waiting workspaces.
+ */
+static void zstd_put_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_to_workspace(ws);
+
+ spin_lock(&wsm.lock);
+
+ /* A node is only taken off the lru if we are the corresponding level */
+ if (workspace->req_level == workspace->level) {
+ /* Hide a max level workspace from reclaim */
+ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ INIT_LIST_HEAD(&workspace->lru_list);
+ } else {
+ workspace->last_used = jiffies;
+ list_add(&workspace->lru_list, &wsm.lru_list);
+ if (!timer_pending(&wsm.timer))
+ mod_timer(&wsm.timer,
+ jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ }
+ }
+
+ set_bit(workspace->level - 1, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ workspace->req_level = 0;
+
+ spin_unlock(&wsm.lock);
+
+ if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ cond_wake_up(&wsm.wait);
+}
+
static void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -50,25 +337,25 @@ static void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zstd_alloc_workspace(void)
+static struct list_head *zstd_alloc_workspace(unsigned int level)
{
- ZSTD_parameters params =
- zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = max_t(size_t,
- ZSTD_CStreamWorkspaceBound(params.cParams),
- ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+ workspace->size = zstd_ws_mem_sizes[level - 1];
+ workspace->level = level;
+ workspace->req_level = level;
+ workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
+ INIT_LIST_HEAD(&workspace->lru_list);
return &workspace->list;
fail:
@@ -95,7 +382,8 @@ static int zstd_compress_pages(struct list_head *ws,
unsigned long len = *total_out;
const unsigned long nr_dest_pages = *out_pages;
unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+ ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+ len);
*out_pages = 0;
*total_out = 0;
@@ -419,11 +707,19 @@ finish:
return ret;
}
-static void zstd_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zstd_set_level(unsigned int level)
{
+ if (!level)
+ return ZSTD_BTRFS_DEFAULT_LEVEL;
+
+ return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
}
const struct btrfs_compress_op btrfs_zstd_compress = {
+ .init_workspace_manager = zstd_init_workspace_manager,
+ .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
+ .get_workspace = zstd_get_workspace,
+ .put_workspace = zstd_put_workspace,
.alloc_workspace = zstd_alloc_workspace,
.free_workspace = zstd_free_workspace,
.compress_pages = zstd_compress_pages,
diff --git a/fs/buffer.c b/fs/buffer.c
index 48318fb74938..ce357602f471 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3027,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio)
/* Uhhuh. We've got a bio that straddles the device size! */
truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
+ /*
+ * The bio contains more than one segment which spans EOD, just return
+ * and let IO layer turn it into an EIO
+ */
+ if (truncated_bytes > bvec->bv_len)
+ return;
+
/* Truncate the bio.. */
bio->bi_iter.bi_size -= truncated_bytes;
bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
- zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ struct bio_vec bv;
+
+ mp_bvec_last_segment(bvec, &bv);
+ zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bba28a5034ba..36a8dc699448 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt)
{
spin_lock(&mdsc->caps_list_lock);
- mdsc->caps_min_count += delta;
- BUG_ON(mdsc->caps_min_count < 0);
+ mdsc->caps_min_count = fsopt->max_readdir;
+ if (mdsc->caps_min_count < 1024)
+ mdsc->caps_min_count = 1024;
+ mdsc->caps_use_max = fsopt->caps_max;
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_max < mdsc->caps_min_count)
+ mdsc->caps_use_max = mdsc->caps_min_count;
spin_unlock(&mdsc->caps_list_lock);
}
@@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
if (!err) {
BUG_ON(have + alloc != need);
ctx->count = need;
+ ctx->used = 0;
}
spin_lock(&mdsc->caps_list_lock);
@@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
}
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+ struct ceph_cap_reservation *ctx)
{
+ bool reclaim = false;
+ if (!ctx->count)
+ return;
+
dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
spin_lock(&mdsc->caps_list_lock);
__ceph_unreserve_caps(mdsc, ctx->count);
ctx->count = 0;
+
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ reclaim = true;
spin_unlock(&mdsc->caps_list_lock);
+
+ if (reclaim)
+ ceph_reclaim_caps_nr(mdsc, ctx->used);
}
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
@@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
BUG_ON(list_empty(&mdsc->caps_list));
ctx->count--;
+ ctx->used++;
mdsc->caps_reserve_count--;
mdsc->caps_use_count++;
@@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_min = round_jiffies(jiffies +
- ma->caps_wanted_delay_min * HZ);
+ opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
- ma->caps_wanted_delay_max * HZ);
+ opt->caps_wanted_delay_max * HZ);
dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
}
@@ -657,6 +676,10 @@ void ceph_add_cap(struct inode *inode,
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
} else {
+ spin_lock(&session->s_cap_lock);
+ list_move_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
if (cap->cap_gen < session->s_cap_gen)
cap->issued = cap->implemented = CEPH_CAP_PIN;
@@ -1081,9 +1104,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
(!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
cap->queue_release = 1;
if (removed) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
removed = 0;
}
} else {
@@ -1245,7 +1266,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
* Queue cap releases when an inode is dropped from our cache. Since
* inode is about to be destroyed, there is no need for i_ceph_lock.
*/
-void ceph_queue_caps_release(struct inode *inode)
+void __ceph_remove_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct rb_node *p;
@@ -2393,6 +2414,12 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ /* encode_caps_cb() also will reset these sequence
+ * numbers. make sure sequence numbers in cap flush
+ * message match later reconnect message */
+ cap->seq = 0;
+ cap->issue_seq = 0;
+ cap->mseq = 0;
__kick_flushing_caps(mdsc, session, ci,
oldest_flush_tid);
} else {
@@ -3880,12 +3907,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap->seq = seq;
cap->issue_seq = seq;
spin_lock(&session->s_cap_lock);
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- goto flush_cap_releases;
+ goto done;
}
/* these will work even if we don't have a cap yet */
@@ -3955,7 +3980,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_cap_op_name(op));
}
- goto done;
+done:
+ mutex_unlock(&session->s_mutex);
+done_unlocked:
+ iput(inode);
+ ceph_put_string(extra_info.pool_ns);
+ return;
flush_cap_releases:
/*
@@ -3963,14 +3993,8 @@ flush_cap_releases:
* along for the mds (who clearly thinks we still have this
* cap).
*/
- ceph_send_cap_releases(mdsc, session);
-
-done:
- mutex_unlock(&session->s_mutex);
-done_unlocked:
- iput(inode);
- ceph_put_string(extra_info.pool_ns);
- return;
+ ceph_flush_cap_releases(mdsc, session);
+ goto done;
bad:
pr_err("ceph_handle_caps: corrupt message\n");
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index abdf98deeec4..98365e74cb4a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -139,23 +139,6 @@ static int caps_show(struct seq_file *s, void *p)
return 0;
}
-static int dentry_lru_show(struct seq_file *s, void *ptr)
-{
- struct ceph_fs_client *fsc = s->private;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_dentry_info *di;
-
- spin_lock(&mdsc->dentry_lru_lock);
- list_for_each_entry(di, &mdsc->dentry_lru, lru) {
- struct dentry *dentry = di->dentry;
- seq_printf(s, "%p %p\t%pd\n",
- di, dentry, dentry);
- }
- spin_unlock(&mdsc->dentry_lru_lock);
-
- return 0;
-}
-
static int mds_sessions_show(struct seq_file *s, void *ptr)
{
struct ceph_fs_client *fsc = s->private;
@@ -195,7 +178,6 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
@@ -231,7 +213,6 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_mdsc);
- debugfs_remove(fsc->debugfs_dentry_lru);
}
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
@@ -291,14 +272,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
if (!fsc->debugfs_caps)
goto out;
- fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &dentry_lru_show_fops);
- if (!fsc->debugfs_dentry_lru)
- goto out;
-
return 0;
out:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82928cea0209..a8f429882249 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -29,6 +29,9 @@
const struct dentry_operations ceph_dentry_ops;
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
+static int __dir_lease_try_check(const struct dentry *dentry);
+
/*
* Initialize ceph dentry state.
*/
@@ -44,7 +47,7 @@ static int ceph_d_init(struct dentry *dentry)
di->lease_session = NULL;
di->time = jiffies;
dentry->d_fsdata = di;
- ceph_dentry_lru_add(dentry);
+ INIT_LIST_HEAD(&di->lease_list);
return 0;
}
@@ -241,6 +244,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
goto out;
}
if (fpos_cmp(ctx->pos, di->offset) <= 0) {
+ __ceph_dentry_dir_lease_touch(di);
emit_dentry = true;
}
spin_unlock(&dentry->d_lock);
@@ -1125,13 +1129,277 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
}
/*
+ * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
+ * Leases at front of the list will expire first. (Assume all leases have
+ * similar duration)
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+
+ di->flags |= CEPH_DENTRY_LEASE_LIST;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_move_tail(&di->lease_list, &mdsc->dentry_leases);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
+ struct ceph_dentry_info *di)
+{
+ di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
+ di->lease_gen = 0;
+ di->time = jiffies;
+ list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
+}
+
+/*
+ * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
+ * list if it's not in the list, otherwise set 'referenced' flag.
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+ di, dn, dn, di->offset);
+
+ if (!list_empty(&di->lease_list)) {
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ /* don't remove dentry from dentry lease list
+ * if its lease is valid */
+ if (__dentry_lease_is_valid(di))
+ return;
+ } else {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+ }
+
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ di->flags &= ~CEPH_DENTRY_LEASE_LIST;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ __dentry_dir_lease_touch(mdsc, di),
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_lease_unlist(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_client *mdsc;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST)
+ return;
+ if (list_empty(&di->lease_list))
+ return;
+
+ mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_del_init(&di->lease_list);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+enum {
+ KEEP = 0,
+ DELETE = 1,
+ TOUCH = 2,
+ STOP = 4,
+};
+
+struct ceph_lease_walk_control {
+ bool dir_lease;
+ bool expire_dir_lease;
+ unsigned long nr_to_scan;
+ unsigned long dir_lease_ttl;
+};
+
+static unsigned long
+__dentry_leases_walk(struct ceph_mds_client *mdsc,
+ struct ceph_lease_walk_control *lwc,
+ int (*check)(struct dentry*, void*))
+{
+ struct ceph_dentry_info *di, *tmp;
+ struct dentry *dentry, *last = NULL;
+ struct list_head* list;
+ LIST_HEAD(dispose);
+ unsigned long freed = 0;
+ int ret = 0;
+
+ list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_for_each_entry_safe(di, tmp, list, lease_list) {
+ if (!lwc->nr_to_scan)
+ break;
+ --lwc->nr_to_scan;
+
+ dentry = di->dentry;
+ if (last == dentry)
+ break;
+
+ if (!spin_trylock(&dentry->d_lock))
+ continue;
+
+ if (dentry->d_lockref.count < 0) {
+ list_del_init(&di->lease_list);
+ goto next;
+ }
+
+ ret = check(dentry, lwc);
+ if (ret & TOUCH) {
+ /* move it into tail of dir lease list */
+ __dentry_dir_lease_touch(mdsc, di);
+ if (!last)
+ last = dentry;
+ }
+ if (ret & DELETE) {
+ /* stale lease */
+ di->flags &= ~CEPH_DENTRY_REFERENCED;
+ if (dentry->d_lockref.count > 0) {
+ /* update_dentry_lease() will re-add
+ * it to lease list, or
+ * ceph_d_delete() will return 1 when
+ * last reference is dropped */
+ list_del_init(&di->lease_list);
+ } else {
+ di->flags |= CEPH_DENTRY_SHRINK_LIST;
+ list_move_tail(&di->lease_list, &dispose);
+ dget_dlock(dentry);
+ }
+ }
+next:
+ spin_unlock(&dentry->d_lock);
+ if (ret & STOP)
+ break;
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+
+ while (!list_empty(&dispose)) {
+ di = list_first_entry(&dispose, struct ceph_dentry_info,
+ lease_list);
+ dentry = di->dentry;
+ spin_lock(&dentry->d_lock);
+
+ list_del_init(&di->lease_list);
+ di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
+ if (di->flags & CEPH_DENTRY_REFERENCED) {
+ spin_lock(&mdsc->dentry_list_lock);
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ list_add_tail(&di->lease_list,
+ &mdsc->dentry_leases);
+ } else {
+ __dentry_dir_lease_touch(mdsc, di);
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+ } else {
+ freed++;
+ }
+
+ spin_unlock(&dentry->d_lock);
+ /* ceph_d_delete() does the trick */
+ dput(dentry);
+ }
+ return freed;
+}
+
+static int __dentry_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int ret;
+
+ if (__dentry_lease_is_valid(di))
+ return STOP;
+ ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0)
+ return TOUCH;
+ return DELETE;
+}
+
+static int __dir_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_lease_walk_control *lwc = arg;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ int ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0) {
+ if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
+ return STOP;
+ /* Move dentry to tail of dir lease list if we don't want
+ * to delete it. So dentries in the list are checked in a
+ * round robin manner */
+ if (!lwc->expire_dir_lease)
+ return TOUCH;
+ if (dentry->d_lockref.count > 0 ||
+ (di->flags & CEPH_DENTRY_REFERENCED))
+ return TOUCH;
+ /* invalidate dir lease */
+ di->lease_shared_gen = 0;
+ }
+ return DELETE;
+}
+
+int ceph_trim_dentries(struct ceph_mds_client *mdsc)
+{
+ struct ceph_lease_walk_control lwc;
+ unsigned long count;
+ unsigned long freed;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ count = mdsc->caps_use_count - mdsc->caps_use_max;
+ else
+ count = 0;
+ spin_unlock(&mdsc->caps_list_lock);
+
+ lwc.dir_lease = false;
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
+ freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ if (!lwc.nr_to_scan) /* more invalid leases */
+ return -EAGAIN;
+
+ if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
+
+ lwc.dir_lease = true;
+ lwc.expire_dir_lease = freed < count;
+ lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
+ freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ if (!lwc.nr_to_scan) /* more to check */
+ return -EAGAIN;
+
+ return freed > 0 ? 1 : 0;
+}
+
+/*
* Ensure a dentry lease will no longer revalidate.
*/
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
- ceph_dentry(dentry)->time = jiffies;
- ceph_dentry(dentry)->lease_shared_gen = 0;
+ di->time = jiffies;
+ di->lease_shared_gen = 0;
+ __dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1139,45 +1407,59 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_session *session;
+
+ if (!di->lease_gen)
+ return false;
+
+ session = di->lease_session;
+ if (session) {
+ u32 gen;
+ unsigned long ttl;
+
+ spin_lock(&session->s_gen_ttl_lock);
+ gen = session->s_cap_gen;
+ ttl = session->s_cap_ttl;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, ttl) &&
+ time_before(jiffies, di->time))
+ return true;
+ }
+ di->lease_gen = 0;
+ return false;
+}
+
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
struct inode *dir)
{
struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
struct ceph_mds_session *session = NULL;
u32 seq = 0;
+ int valid = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
- s = di->lease_session;
- spin_lock(&s->s_gen_ttl_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_gen_ttl_lock);
+ if (di && __dentry_lease_is_valid(di)) {
+ valid = 1;
- if (di->lease_gen == gen &&
- time_before(jiffies, di->time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- if (di->lease_renew_after &&
- time_after(jiffies, di->lease_renew_after)) {
- /*
- * We should renew. If we're in RCU walk mode
- * though, we can't do that so just return
- * -ECHILD.
- */
- if (flags & LOOKUP_RCU) {
- valid = -ECHILD;
- } else {
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
- }
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(di->lease_session);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
}
}
}
@@ -1193,6 +1475,38 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
}
/*
+ * Called under dentry->d_lock.
+ */
+static int __dir_lease_try_check(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *dir;
+ struct ceph_inode_info *ci;
+ int valid = 0;
+
+ if (!di->lease_shared_gen)
+ return 0;
+ if (IS_ROOT(dentry))
+ return 0;
+
+ dir = d_inode(dentry->d_parent);
+ ci = ceph_inode(dir);
+
+ if (spin_trylock(&ci->i_ceph_lock)) {
+ if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
+ valid = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ valid = -EBUSY;
+ }
+
+ if (!valid)
+ di->lease_shared_gen = 0;
+ return valid;
+}
+
+/*
* Check if directory-wide content lease/cap is valid.
*/
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
@@ -1205,6 +1519,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
spin_unlock(&ci->i_ceph_lock);
+ if (valid)
+ __ceph_dentry_dir_lease_touch(di);
dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
dir, (unsigned)atomic_read(&ci->i_shared_gen),
dentry, (unsigned)di->lease_shared_gen, valid);
@@ -1297,11 +1613,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid) {
- ceph_dentry_lru_touch(dentry);
- } else {
+ if (!valid)
ceph_dir_clear_complete(dir);
- }
if (!(flags & LOOKUP_RCU))
dput(parent);
@@ -1309,6 +1622,31 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
/*
+ * Delete unused dentry that doesn't have valid lease
+ *
+ * Called under dentry->d_lock.
+ */
+static int ceph_d_delete(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ /* won't release caps */
+ if (d_really_is_negative(dentry))
+ return 0;
+ if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+ return 0;
+ /* vaild lease? */
+ di = ceph_dentry(dentry);
+ if (di) {
+ if (__dentry_lease_is_valid(di))
+ return 0;
+ if (__dir_lease_try_check(dentry))
+ return 0;
+ }
+ return 1;
+}
+
+/*
* Release our ceph_dentry_info.
*/
static void ceph_d_release(struct dentry *dentry)
@@ -1316,9 +1654,9 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- ceph_dentry_lru_del(dentry);
spin_lock(&dentry->d_lock);
+ __dentry_lease_unlist(di);
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
@@ -1419,49 +1757,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
return size - left;
}
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
- di->offset);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-void ceph_dentry_lru_del(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
/*
* Return name hash for a given dentry. This is dependent on
@@ -1531,6 +1827,7 @@ const struct inode_operations ceph_snapdir_iops = {
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
+ .d_delete = ceph_d_delete,
.d_release = ceph_d_release,
.d_prune = ceph_d_prune,
.d_init = ceph_d_init,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 189df668b6a0..9f53c3d99304 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -590,7 +590,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ off, off + len - 1);
if (ret < 0)
return ret;
@@ -929,14 +930,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(write ? "write" : "read"), file, pos, (unsigned)count,
snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
if (write) {
int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret2);
@@ -1132,13 +1134,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
file, pos, (unsigned)count, snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9d1f34d46627..e3346628efe2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -497,7 +497,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
- atomic_set(&ci->i_shared_gen, 0);
+ atomic_set(&ci->i_shared_gen, 1);
ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0;
@@ -537,7 +537,7 @@ void ceph_destroy_inode(struct inode *inode)
ceph_fscache_unregister_inode_cookie(ci);
- ceph_queue_caps_release(inode);
+ __ceph_remove_caps(inode);
if (__ceph_has_any_quota(ci))
ceph_adjust_quota_realms_count(inode, false);
@@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct ceph_snap_realm *realm = ci->i_snap_realm;
-
- dout(" dropping residual ref to snap realm %p\n", realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ ceph_inode_to_client(inode)->mdsc;
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ dout(" dropping residual ref to snap realm %p\n",
+ realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ } else {
+ ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+ ci->i_snap_realm = NULL;
+ }
}
kfree(ci->i_symlink);
@@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
iinfo->pool_ns_len);
+ if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+ ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -869,6 +877,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ci->i_rbytes = le64_to_cpu(info->rbytes);
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ci->i_dir_pin = iinfo->dir_pin;
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
}
}
@@ -899,6 +908,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
+ inode->i_blkbits = PAGE_SHIFT;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
inode->i_op = &ceph_file_iops;
break;
@@ -1066,9 +1076,10 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
-
- if (duration == 0)
+ if (duration == 0) {
+ __ceph_dentry_dir_lease_touch(di);
goto out_unlock;
+ }
if (di->lease_gen == session->s_cap_gen &&
time_before(ttl, di->time))
@@ -1079,8 +1090,6 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_session = NULL;
}
- ceph_dentry_lru_touch(dentry);
-
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
di->lease_gen = session->s_cap_gen;
@@ -1088,6 +1097,8 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
di->time = ttl;
+
+ __ceph_dentry_lease_touch(di);
out_unlock:
spin_unlock(&dentry->d_lock);
if (old_lease_session)
@@ -2259,10 +2270,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
if (!err) {
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
- if (ceph_snap(inode) != CEPH_NOSNAP)
- stat->dev = ceph_snap(inode);
+ if (ceph_snap(inode) == CEPH_NOSNAP)
+ stat->dev = inode->i_sb->s_dev;
else
- stat->dev = 0;
+ stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
if (S_ISDIR(inode->i_mode)) {
if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
RBYTES))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 163fc74bf221..21c33ed048ed 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -20,6 +20,8 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
+
/*
* A cluster of MDS (metadata server) daemons is responsible for
* managing the file system namespace (the directory hierarchy and
@@ -46,13 +48,17 @@
*/
struct ceph_reconnect_state {
- int nr_caps;
+ struct ceph_mds_session *session;
+ int nr_caps, nr_realms;
struct ceph_pagelist *pagelist;
unsigned msg_version;
+ bool allow_multi;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head);
+static void ceph_cap_release_work(struct work_struct *work);
+static void ceph_cap_reclaim_work(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops;
@@ -61,6 +67,29 @@ static const struct ceph_connection_operations mds_con_ops;
* mds reply parsing
*/
+static int parse_reply_info_quota(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only
+ * understand encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ ceph_decode_64_safe(p, end, info->max_bytes, bad);
+ ceph_decode_64_safe(p, end, info->max_files, bad);
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
/*
* parse individual inode info
*/
@@ -68,8 +97,24 @@ static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
u64 features)
{
- int err = -EIO;
+ int err = 0;
+ u8 struct_v = 0;
+ if (features == (u64)-1) {
+ u32 struct_len;
+ u8 struct_compat;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ }
+
+ ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
info->in = *p;
*p += sizeof(struct ceph_mds_reply_inode) +
sizeof(*info->in->fragtree.splits) *
@@ -87,49 +132,136 @@ static int parse_reply_info_in(void **p, void *end,
info->xattr_data = *p;
*p += info->xattr_len;
- if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ if (features == (u64)-1) {
+ /* inline data */
ceph_decode_64_safe(p, end, info->inline_version, bad);
ceph_decode_32_safe(p, end, info->inline_len, bad);
ceph_decode_need(p, end, info->inline_len, bad);
info->inline_data = *p;
*p += info->inline_len;
- } else
- info->inline_version = CEPH_INLINE_NONE;
+ /* quota */
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ /* pool namespace */
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ /* btime, change_attr */
+ {
+ struct ceph_timespec btime;
+ u64 change_attr;
+ ceph_decode_need(p, end, sizeof(btime), bad);
+ ceph_decode_copy(p, &btime, sizeof(btime));
+ ceph_decode_64_safe(p, end, change_attr, bad);
+ }
+
+ /* dir pin */
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(p, end, info->dir_pin, bad);
+ } else {
+ info->dir_pin = -ENODATA;
+ }
+
+ *p = end;
+ } else {
+ if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ } else
+ info->inline_version = CEPH_INLINE_NONE;
+
+ if (features & CEPH_FEATURE_MDS_QUOTA) {
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ } else {
+ info->max_bytes = 0;
+ info->max_files = 0;
+ }
+
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ }
- if (features & CEPH_FEATURE_MDS_QUOTA) {
+ info->dir_pin = -ENODATA;
+ }
+ return 0;
+bad:
+ err = -EIO;
+out_bad:
+ return err;
+}
+
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_dirfrag **dirfrag,
+ u64 features)
+{
+ if (features == (u64)-1) {
u8 struct_v, struct_compat;
u32 struct_len;
-
- /*
- * both struct_v and struct_compat are expected to be >= 1
- */
ceph_decode_8_safe(p, end, struct_v, bad);
ceph_decode_8_safe(p, end, struct_compat, bad);
- if (!struct_v || !struct_compat)
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
goto bad;
ceph_decode_32_safe(p, end, struct_len, bad);
ceph_decode_need(p, end, struct_len, bad);
- ceph_decode_64_safe(p, end, info->max_bytes, bad);
- ceph_decode_64_safe(p, end, info->max_files, bad);
- } else {
- info->max_bytes = 0;
- info->max_files = 0;
+ end = *p + struct_len;
}
- info->pool_ns_len = 0;
- info->pool_ns_data = NULL;
- if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
- ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- if (info->pool_ns_len > 0) {
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- info->pool_ns_data = *p;
- *p += info->pool_ns_len;
- }
+ ceph_decode_need(p, end, sizeof(**dirfrag), bad);
+ *dirfrag = *p;
+ *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+ if (features == (u64)-1)
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
+static int parse_reply_info_lease(void **p, void *end,
+ struct ceph_mds_reply_lease **lease,
+ u64 features)
+{
+ if (features == (u64)-1) {
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
}
+ ceph_decode_need(p, end, sizeof(**lease), bad);
+ *lease = *p;
+ *p += sizeof(**lease);
+ if (features == (u64)-1)
+ *p = end;
return 0;
bad:
- return err;
+ return -EIO;
}
/*
@@ -147,20 +279,18 @@ static int parse_reply_info_trace(void **p, void *end,
if (err < 0)
goto out_bad;
- if (unlikely(*p + sizeof(*info->dirfrag) > end))
- goto bad;
- info->dirfrag = *p;
- *p += sizeof(*info->dirfrag) +
- sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
- if (unlikely(*p > end))
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dirfrag, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_32_safe(p, end, info->dname_len, bad);
ceph_decode_need(p, end, info->dname_len, bad);
info->dname = *p;
*p += info->dname_len;
- info->dlease = *p;
- *p += sizeof(*info->dlease);
+
+ err = parse_reply_info_lease(p, end, &info->dlease, features);
+ if (err < 0)
+ goto out_bad;
}
if (info->head->is_target) {
@@ -183,20 +313,16 @@ out_bad:
/*
* parse readdir results
*/
-static int parse_reply_info_dir(void **p, void *end,
+static int parse_reply_info_readdir(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
u32 num, i = 0;
int err;
- info->dir_dir = *p;
- if (*p + sizeof(*info->dir_dir) > end)
- goto bad;
- *p += sizeof(*info->dir_dir) +
- sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
- if (*p > end)
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dir_dir, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p);
@@ -222,15 +348,16 @@ static int parse_reply_info_dir(void **p, void *end,
while (num) {
struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
/* dentry */
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- rde->name_len = ceph_decode_32(p);
+ ceph_decode_32_safe(p, end, rde->name_len, bad);
ceph_decode_need(p, end, rde->name_len, bad);
rde->name = *p;
*p += rde->name_len;
dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
- rde->lease = *p;
- *p += sizeof(struct ceph_mds_reply_lease);
+ /* dentry lease */
+ err = parse_reply_info_lease(p, end, &rde->lease, features);
+ if (err)
+ goto out_bad;
/* inode */
err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
@@ -281,7 +408,8 @@ static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
- if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (features == (u64)-1 ||
+ (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
if (*p == end) {
info->has_create_ino = false;
} else {
@@ -310,7 +438,7 @@ static int parse_reply_info_extra(void **p, void *end,
if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
- return parse_reply_info_dir(p, end, info, features);
+ return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
else
@@ -494,7 +622,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 0;
+ s->s_cap_gen = 1;
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
@@ -510,6 +638,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
+
INIT_LIST_HEAD(&s->s_cap_flushing);
mdsc->sessions[mds] = s;
@@ -535,6 +665,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
dout("__unregister_session mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
+ s->s_state = 0;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
atomic_dec(&mdsc->num_sessions);
@@ -1197,13 +1328,10 @@ static int iterate_session_caps(struct ceph_mds_session *session,
cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
- if (cap->queue_release) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
- } else {
+ if (cap->queue_release)
+ __ceph_queue_cap_release(session, cap);
+ else
old_cap = cap; /* put_cap it w/o locks held */
- }
}
if (ret < 0)
goto out;
@@ -1638,7 +1766,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
session->s_trim_caps = 0;
}
- ceph_send_cap_releases(mdsc, session);
+ ceph_flush_cap_releases(mdsc, session);
return 0;
}
@@ -1681,8 +1809,8 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
/*
* called under s_mutex
*/
-void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
@@ -1774,6 +1902,81 @@ out_err:
spin_unlock(&session->s_cap_lock);
}
+static void ceph_cap_release_work(struct work_struct *work)
+{
+ struct ceph_mds_session *session =
+ container_of(work, struct ceph_mds_session, s_cap_release_work);
+
+ mutex_lock(&session->s_mutex);
+ if (session->s_state == CEPH_MDS_SESSION_OPEN ||
+ session->s_state == CEPH_MDS_SESSION_HUNG)
+ ceph_send_cap_releases(session->s_mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (mdsc->stopping)
+ return;
+
+ get_session(session);
+ if (queue_work(mdsc->fsc->cap_wq,
+ &session->s_cap_release_work)) {
+ dout("cap release work queued\n");
+ } else {
+ ceph_put_mds_session(session);
+ dout("failed to queue cap release work\n");
+ }
+}
+
+/*
+ * caller holds session->s_cap_lock
+ */
+void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+{
+ list_add_tail(&cap->session_caps, &session->s_cap_releases);
+ session->s_num_cap_releases++;
+
+ if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
+ ceph_flush_cap_releases(session->s_mdsc, session);
+}
+
+static void ceph_cap_reclaim_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_reclaim_work);
+ int ret = ceph_trim_dentries(mdsc);
+ if (ret == -EAGAIN)
+ ceph_queue_cap_reclaim_work(mdsc);
+}
+
+void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
+{
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
+ dout("caps reclaim work queued\n");
+ } else {
+ dout("failed to queue caps release work\n");
+ }
+}
+
+void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
+{
+ int val;
+ if (!nr)
+ return;
+ val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
+ if (!(val % CEPH_CAPS_PER_RELEASE)) {
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+ ceph_queue_cap_reclaim_work(mdsc);
+ }
+}
+
/*
* requests
*/
@@ -2653,7 +2856,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
+ err = parse_reply_info(msg, rinfo, (u64)-1);
+ else
+ err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -2684,7 +2890,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP))
ceph_readdir_prepopulate(req, req->r_session);
- ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
@@ -2693,12 +2898,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (realm)
ceph_put_snap_realm(mdsc, realm);
- if (err == 0 && req->r_target_inode &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
- spin_unlock(&ci->i_unsafe_lock);
+ if (err == 0) {
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item,
+ &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+
+ ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
out_err:
mutex_lock(&mdsc->mutex);
@@ -2777,6 +2988,25 @@ bad:
pr_err("mdsc_handle_forward decode error err=%d\n", err);
}
+static int __decode_and_drop_session_metadata(void **p, void *end)
+{
+ /* map<string,string> */
+ u32 n;
+ ceph_decode_32_safe(p, end, n, bad);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ }
+ return 0;
+bad:
+ return -1;
+}
+
/*
* handle a mds session control message
*/
@@ -2784,18 +3014,36 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ int mds = session->s_mds;
+ int msg_version = le16_to_cpu(msg->hdr.version);
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mds_session_head *h;
u32 op;
u64 seq;
- int mds = session->s_mds;
- struct ceph_mds_session_head *h = msg->front.iov_base;
+ unsigned long features = 0;
int wake = 0;
/* decode */
- if (msg->front.iov_len < sizeof(*h))
- goto bad;
+ ceph_decode_need(&p, end, sizeof(*h), bad);
+ h = p;
+ p += sizeof(*h);
+
op = le32_to_cpu(h->op);
seq = le64_to_cpu(h->seq);
+ if (msg_version >= 3) {
+ u32 len;
+ /* version >= 2, metadata */
+ if (__decode_and_drop_session_metadata(&p, end) < 0)
+ goto bad;
+ /* version >= 3, feature bits */
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+ memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+ p += len;
+ }
+
mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) {
get_session(session);
@@ -2821,6 +3069,7 @@ static void handle_session(struct ceph_mds_session *session,
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
renewed_caps(mdsc, session, 0);
wake = 1;
if (mdsc->stopping)
@@ -2947,6 +3196,82 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
}
+static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
+{
+ struct ceph_msg *reply;
+ struct ceph_pagelist *_pagelist;
+ struct page *page;
+ __le32 *addr;
+ int err = -ENOMEM;
+
+ if (!recon_state->allow_multi)
+ return -ENOSPC;
+
+ /* can't handle message that contains both caps and realm */
+ BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
+
+ /* pre-allocate new pagelist */
+ _pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!_pagelist)
+ return -ENOMEM;
+
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
+ if (!reply)
+ goto fail_msg;
+
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
+ if (err)
+ goto fail;
+ } else {
+ /* placeholder for nr_realms (currently encoding relams) */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
+ if (err)
+ goto fail;
+
+ page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
+ addr = kmap_atomic(page);
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ *addr = cpu_to_le32(recon_state->nr_caps);
+ } else {
+ /* currently encoding relams */
+ *(addr + 1) = cpu_to_le32(recon_state->nr_realms);
+ }
+ kunmap_atomic(addr);
+
+ reply->hdr.version = cpu_to_le16(5);
+ reply->hdr.compat_version = cpu_to_le16(4);
+
+ reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
+
+ ceph_con_send(&recon_state->session->s_con, reply);
+ ceph_pagelist_release(recon_state->pagelist);
+
+ recon_state->pagelist = _pagelist;
+ recon_state->nr_caps = 0;
+ recon_state->nr_realms = 0;
+ recon_state->msg_version = 5;
+ return 0;
+fail:
+ ceph_msg_put(reply);
+fail_msg:
+ ceph_pagelist_release(_pagelist);
+ return err;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
@@ -2966,9 +3291,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
- err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- if (err)
- return err;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -3008,7 +3330,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks = NULL;
- size_t struct_len, total_len = 0;
+ size_t struct_len, total_len = sizeof(u64);
u8 struct_v = 0;
encode_again:
@@ -3043,7 +3365,7 @@ encode_again:
if (recon_state->msg_version >= 3) {
/* version, compat_version and struct_len */
- total_len = 2 * sizeof(u8) + sizeof(u32);
+ total_len += 2 * sizeof(u8) + sizeof(u32);
struct_v = 2;
}
/*
@@ -3060,12 +3382,19 @@ encode_again:
struct_len += sizeof(u64); /* snap_follows */
total_len += struct_len;
- err = ceph_pagelist_reserve(pagelist, total_len);
- if (err) {
- kfree(flocks);
- goto out_err;
+
+ if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto out_freeflocks;
+ pagelist = recon_state->pagelist;
}
+ err = ceph_pagelist_reserve(pagelist, total_len);
+ if (err)
+ goto out_freeflocks;
+
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
if (recon_state->msg_version >= 3) {
ceph_pagelist_encode_8(pagelist, struct_v);
ceph_pagelist_encode_8(pagelist, 1);
@@ -3077,7 +3406,7 @@ encode_again:
num_fcntl_locks, num_flock_locks);
if (struct_v >= 2)
ceph_pagelist_encode_64(pagelist, snap_follows);
-
+out_freeflocks:
kfree(flocks);
} else {
u64 pathbase = 0;
@@ -3098,20 +3427,81 @@ encode_again:
}
err = ceph_pagelist_reserve(pagelist,
- pathlen + sizeof(u32) + sizeof(rec.v1));
+ sizeof(u64) + sizeof(u32) +
+ pathlen + sizeof(rec.v1));
if (err) {
- kfree(path);
- goto out_err;
+ goto out_freepath;
}
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-
+out_freepath:
kfree(path);
}
- recon_state->nr_caps++;
out_err:
+ if (err >= 0)
+ recon_state->nr_caps++;
+ return err;
+}
+
+static int encode_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_reconnect_state *recon_state)
+{
+ struct rb_node *p;
+ struct ceph_pagelist *pagelist = recon_state->pagelist;
+ int err = 0;
+
+ if (recon_state->msg_version >= 4) {
+ err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
+ if (err < 0)
+ goto fail;
+ }
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+ struct ceph_snap_realm *realm =
+ rb_entry(p, struct ceph_snap_realm, node);
+ struct ceph_mds_snaprealm_reconnect sr_rec;
+
+ if (recon_state->msg_version >= 4) {
+ size_t need = sizeof(u8) * 2 + sizeof(u32) +
+ sizeof(sr_rec);
+
+ if (pagelist->length + need > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto fail;
+ pagelist = recon_state->pagelist;
+ }
+
+ err = ceph_pagelist_reserve(pagelist, need);
+ if (err)
+ goto fail;
+
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
+ }
+
+ dout(" adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ sr_rec.ino = cpu_to_le64(realm->ino);
+ sr_rec.seq = cpu_to_le64(realm->seq);
+ sr_rec.parent = cpu_to_le64(realm->parent_ino);
+
+ err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+ if (err)
+ goto fail;
+
+ recon_state->nr_realms++;
+ }
+fail:
return err;
}
@@ -3132,18 +3522,17 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_msg *reply;
- struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
- int s_nr_caps;
- struct ceph_pagelist *pagelist;
- struct ceph_reconnect_state recon_state;
+ struct ceph_reconnect_state recon_state = {
+ .session = session,
+ };
LIST_HEAD(dispose);
pr_info("mds%d reconnect start\n", mds);
- pagelist = ceph_pagelist_alloc(GFP_NOFS);
- if (!pagelist)
+ recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!recon_state.pagelist)
goto fail_nopagelist;
reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
@@ -3187,63 +3576,90 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
/* replay unsafe requests */
replay_unsafe_requests(mdsc, session);
+ ceph_early_kick_flushing_caps(mdsc, session);
+
down_read(&mdsc->snap_rwsem);
- /* traverse this session's caps */
- s_nr_caps = session->s_nr_caps;
- err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
if (err)
goto fail;
- recon_state.nr_caps = 0;
- recon_state.pagelist = pagelist;
- if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
recon_state.msg_version = 3;
- else
+ recon_state.allow_multi = true;
+ } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
+ recon_state.msg_version = 3;
+ } else {
recon_state.msg_version = 2;
+ }
+ /* trsaverse this session's caps */
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
- if (err < 0)
- goto fail;
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
spin_unlock(&session->s_cap_lock);
- /*
- * snaprealms. we provide mds with the ino, seq (version), and
- * parent for all of our realms. If the mds has any newer info,
- * it will tell us.
- */
- for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
- struct ceph_snap_realm *realm =
- rb_entry(p, struct ceph_snap_realm, node);
- struct ceph_mds_snaprealm_reconnect sr_rec;
+ if (err < 0)
+ goto fail;
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
- sr_rec.ino = cpu_to_le64(realm->ino);
- sr_rec.seq = cpu_to_le64(realm->seq);
- sr_rec.parent = cpu_to_le64(realm->parent_ino);
- err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
- if (err)
- goto fail;
+ /* check if all realms can be encoded into current message */
+ if (mdsc->num_snap_realms) {
+ size_t total_len =
+ recon_state.pagelist->length +
+ mdsc->num_snap_realms *
+ sizeof(struct ceph_mds_snaprealm_reconnect);
+ if (recon_state.msg_version >= 4) {
+ /* number of realms */
+ total_len += sizeof(u32);
+ /* version, compat_version and struct_len */
+ total_len += mdsc->num_snap_realms *
+ (2 * sizeof(u8) + sizeof(u32));
+ }
+ if (total_len > RECONNECT_MAX_SIZE) {
+ if (!recon_state.allow_multi) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ if (recon_state.nr_caps) {
+ err = send_reconnect_partial(&recon_state);
+ if (err)
+ goto fail;
+ }
+ recon_state.msg_version = 5;
+ }
}
- reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ err = encode_snap_realms(mdsc, &recon_state);
+ if (err < 0)
+ goto fail;
- /* raced with cap release? */
- if (s_nr_caps != recon_state.nr_caps) {
- struct page *page = list_first_entry(&pagelist->head,
- struct page, lru);
+ if (recon_state.msg_version >= 5) {
+ err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ if (recon_state.nr_caps || recon_state.nr_realms) {
+ struct page *page =
+ list_first_entry(&recon_state.pagelist->head,
+ struct page, lru);
__le32 *addr = kmap_atomic(page);
- *addr = cpu_to_le32(recon_state.nr_caps);
+ if (recon_state.nr_caps) {
+ WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ } else if (recon_state.msg_version >= 4) {
+ *(addr + 1) = cpu_to_le32(recon_state.nr_realms);
+ }
kunmap_atomic(addr);
}
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- ceph_msg_data_add_pagelist(reply, pagelist);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ if (recon_state.msg_version >= 4)
+ reply->hdr.compat_version = cpu_to_le16(4);
- ceph_early_kick_flushing_caps(mdsc, session);
+ reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
ceph_con_send(&session->s_con, reply);
@@ -3254,7 +3670,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
return;
fail:
@@ -3262,7 +3678,7 @@ fail:
up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
fail_nomsg:
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
pr_err("error %d preparing reconnect for mds%d\n", err, mds);
return;
@@ -3580,7 +3996,6 @@ static void delayed_work(struct work_struct *work)
int renew_caps;
dout("mdsc delayed_work\n");
- ceph_check_delayed_caps(mdsc);
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -3628,6 +4043,12 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
+ ceph_check_delayed_caps(mdsc);
+
+ ceph_queue_cap_reclaim_work(mdsc);
+
+ ceph_trim_snapid_map(mdsc);
+
schedule_delayed(mdsc);
}
@@ -3660,6 +4081,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
+ mdsc->num_snap_realms = 0;
spin_lock_init(&mdsc->snap_empty_lock);
mdsc->last_tid = 0;
mdsc->oldest_tid = 0;
@@ -3677,11 +4099,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
- spin_lock_init(&mdsc->dentry_lru_lock);
- INIT_LIST_HEAD(&mdsc->dentry_lru);
+ INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+
+ spin_lock_init(&mdsc->dentry_list_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_leases);
+ INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, fsc->min_caps);
+ ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
+
+ spin_lock_init(&mdsc->snapid_map_lock);
+ mdsc->snapid_map_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->snapid_map_lru);
init_rwsem(&mdsc->pool_perm_rwsem);
mdsc->pool_perm_tree = RB_ROOT;
@@ -3876,8 +4306,10 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
WARN_ON(!list_empty(&mdsc->cap_delay_list));
mutex_unlock(&mdsc->mutex);
+ ceph_cleanup_snapid_map(mdsc);
ceph_cleanup_empty_realms(mdsc);
+ cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
dout("stopped\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 729da155ebf0..50385a481fdb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -21,11 +21,14 @@
#define CEPHFS_FEATURE_REPLY_ENCODING 9
#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
+#define CEPHFS_FEATURE_MULTI_RECONNECT 12
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
+ CEPHFS_FEATURE_MULTI_RECONNECT, \
}
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
@@ -65,6 +68,7 @@ struct ceph_mds_reply_info_in {
char *pool_ns_data;
u64 max_bytes;
u64 max_files;
+ s32 dir_pin;
};
struct ceph_mds_reply_dir_entry {
@@ -152,6 +156,7 @@ struct ceph_mds_session {
int s_mds;
int s_state;
unsigned long s_ttl; /* time until mds kills us */
+ unsigned long s_features;
u64 s_seq; /* incoming msg seq # */
struct mutex s_mutex; /* serialize session messages */
@@ -167,19 +172,20 @@ struct ceph_mds_session {
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
+ struct ceph_cap *s_cap_iterator;
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
- struct ceph_cap *s_cap_iterator;
+ struct work_struct s_cap_release_work;
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- refcount_t s_ref;
+ refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -310,6 +316,15 @@ struct ceph_pool_perm {
char pool_ns[];
};
+struct ceph_snapid_map {
+ struct rb_node node;
+ struct list_head lru;
+ atomic_t ref;
+ u64 snap;
+ dev_t dev;
+ unsigned long last_used;
+};
+
/*
* mds client state
*/
@@ -341,6 +356,7 @@ struct ceph_mds_client {
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
+ int num_snap_realms;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
@@ -362,6 +378,9 @@ struct ceph_mds_client {
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
+ struct work_struct cap_reclaim_work;
+ atomic_t cap_reclaim_pending;
+
/*
* Cap reservations
*
@@ -378,13 +397,18 @@ struct ceph_mds_client {
unreserved) */
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
+ int caps_use_max; /* max used caps */
int caps_reserve_count; /* unused, reserved */
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
+ spinlock_t dentry_list_lock;
+ struct list_head dentry_leases; /* fifo list */
+ struct list_head dentry_dir_leases; /* lru list */
+
+ spinlock_t snapid_map_lock;
+ struct rb_root snapid_map_tree;
+ struct list_head snapid_map_lru;
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
@@ -438,9 +462,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
-extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-
+extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
+extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 041c27ea8de1..89aa37fa0f84 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,12 +3,13 @@
#include <linux/sort.h>
#include <linux/slab.h>
-
#include "super.h"
#include "mds_client.h"
-
#include <linux/ceph/decode.h>
+/* unused map expires after 5 minutes */
+#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
+
/*
* Snapshots in ceph are driven in large part by cooperation from the
* client. In contrast to local file systems or file servers that
@@ -124,6 +125,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
+ mdsc->num_snap_realms++;
+
dout("create_snap_realm %llx %p\n", realm->ino, realm);
return realm;
}
@@ -175,6 +178,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
+ mdsc->num_snap_realms--;
if (realm->parent) {
list_del_init(&realm->child_item);
@@ -616,7 +620,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ if (list_empty(&ci->i_snap_flush_item))
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
@@ -985,3 +990,154 @@ out:
up_write(&mdsc->snap_rwsem);
return;
}
+
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap)
+{
+ struct ceph_snapid_map *sm, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&mdsc->snapid_map_lock);
+ p = &mdsc->snapid_map_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap) {
+ p = &(*p)->rb_left;
+ } else if (snap < exist->snap) {
+ p = &(*p)->rb_right;
+ } else {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ break;
+ }
+ exist = NULL;
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ sm = kmalloc(sizeof(*sm), GFP_NOFS);
+ if (!sm)
+ return NULL;
+
+ ret = get_anon_bdev(&sm->dev);
+ if (ret < 0) {
+ kfree(sm);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&sm->lru);
+ atomic_set(&sm->ref, 1);
+ sm->snap = snap;
+
+ exist = NULL;
+ parent = NULL;
+ p = &mdsc->snapid_map_tree.rb_node;
+ spin_lock(&mdsc->snapid_map_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap)
+ p = &(*p)->rb_left;
+ else if (snap < exist->snap)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist) {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ } else {
+ rb_link_node(&sm->node, parent, p);
+ rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm)
+{
+ if (!sm)
+ return;
+ if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+ if (!RB_EMPTY_NODE(&sm->node)) {
+ sm->last_used = jiffies;
+ list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+ spin_unlock(&mdsc->snapid_map_lock);
+ } else {
+ /* already cleaned up by
+ * ceph_cleanup_snapid_map() */
+ spin_unlock(&mdsc->snapid_map_lock);
+ kfree(sm);
+ }
+ }
+}
+
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ unsigned long now;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ now = jiffies;
+
+ while (!list_empty(&mdsc->snapid_map_lru)) {
+ sm = list_first_entry(&mdsc->snapid_map_lru,
+ struct ceph_snapid_map, lru);
+ if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
+ break;
+
+ rb_erase(&sm->node, &mdsc->snapid_map_tree);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ }
+}
+
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ struct rb_node *p;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ while ((p = rb_first(&mdsc->snapid_map_tree))) {
+ sm = rb_entry(p, struct ceph_snapid_map, node);
+ rb_erase(p, &mdsc->snapid_map_tree);
+ RB_CLEAR_NODE(p);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ free_anon_bdev(sm->dev);
+ if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
+ pr_err("snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
+ }
+ }
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index da2cd8e89062..6d5bb2f74612 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -133,6 +133,7 @@ enum {
Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
+ Opt_caps_max,
Opt_readdir_max_entries,
Opt_readdir_max_bytes,
Opt_congestion_kb,
@@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = {
{Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_caps_max, "caps_max=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"},
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
@@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private)
return -EINVAL;
fsopt->caps_wanted_delay_max = intval;
break;
+ case Opt_caps_max:
+ if (intval < 0)
+ return -EINVAL;
+ fsopt->caps_max = intval;
+ break;
case Opt_readdir_max_entries:
if (intval < 1)
return -EINVAL;
@@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",rasize=%d", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ if (fsopt->caps_max)
+ seq_printf(m, ",caps_max=%d", fsopt->caps_max);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
seq_printf(m, ",caps_wanted_delay_min=%d",
fsopt->caps_wanted_delay_min);
@@ -671,6 +680,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
if (!fsc->trunc_wq)
goto fail_pg_inv_wq;
+ fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ if (!fsc->cap_wq)
+ goto fail_trunc_wq;
/* set up mempools */
err = -ENOMEM;
@@ -678,13 +690,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
size = sizeof (struct page *) * (page_count ? page_count : 1);
fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
- goto fail_trunc_wq;
-
- /* caps */
- fsc->min_caps = fsopt->max_readdir;
+ goto fail_cap_wq;
return fsc;
+fail_cap_wq:
+ destroy_workqueue(fsc->cap_wq);
fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
@@ -706,6 +717,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc)
flush_workqueue(fsc->wb_wq);
flush_workqueue(fsc->pg_inv_wq);
flush_workqueue(fsc->trunc_wq);
+ flush_workqueue(fsc->cap_wq);
}
static void destroy_fs_client(struct ceph_fs_client *fsc)
@@ -715,6 +727,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
+ destroy_workqueue(fsc->cap_wq);
mempool_destroy(fsc->wb_pagevec_pool);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dfb64a5211b6..16c03188578e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -79,6 +79,7 @@ struct ceph_mount_options {
int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
+ int caps_max;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
@@ -100,17 +101,18 @@ struct ceph_fs_client {
struct ceph_client *client;
unsigned long mount_state;
- int min_caps; /* min caps i added */
loff_t max_file_size;
struct ceph_mds_client *mdsc;
/* writeback */
mempool_t *wb_pagevec_pool;
+ atomic_long_t writeback_count;
+
struct workqueue_struct *wb_wq;
struct workqueue_struct *pg_inv_wq;
struct workqueue_struct *trunc_wq;
- atomic_long_t writeback_count;
+ struct workqueue_struct *cap_wq;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
@@ -260,17 +262,22 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
+ struct dentry *dentry;
struct ceph_mds_session *lease_session;
+ struct list_head lease_list;
+ unsigned flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
unsigned long time;
u64 offset;
};
+#define CEPH_DENTRY_REFERENCED 1
+#define CEPH_DENTRY_LEASE_LIST 2
+#define CEPH_DENTRY_SHRINK_LIST 4
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -318,6 +325,8 @@ struct ceph_inode_info {
/* quotas */
u64 i_max_bytes, i_max_files;
+ s32 i_dir_pin;
+
struct rb_root i_fragtree;
int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
@@ -370,7 +379,10 @@ struct ceph_inode_info {
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
- struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ union {
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+ };
int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
@@ -587,7 +599,7 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
@@ -656,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt);
extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -837,6 +850,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+
+
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
* sync write (that may/may not still update size, mtime, etc.).
@@ -975,11 +996,11 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void __ceph_remove_caps(struct inode* inode);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode);
-extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
@@ -1049,10 +1070,10 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req,
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di);
+extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern int ceph_trim_dentries(struct ceph_mds_client *mdsc);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 316f6ad10644..0cc42c8879e9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -228,8 +228,19 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
ci->i_rctime.tv_nsec);
}
-/* quotas */
+/* dir pin */
+static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
+{
+ return ci->i_dir_pin != -ENODATA;
+}
+
+static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%d", (int)ci->i_dir_pin);
+}
+/* quotas */
static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
{
bool ret = false;
@@ -315,6 +326,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rbytes),
XATTR_RSTAT_FIELD(dir, rctime),
{
+ .name = "ceph.dir.pin",
+ .name_size = sizeof("ceph.dir_pin"),
+ .getxattr_cb = ceph_vxattrcb_dir_pin,
+ .exists_cb = ceph_vxattrcb_dir_pin_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ {
.name = "ceph.quota",
.name_size = sizeof("ceph.quota"),
.getxattr_cb = ceph_vxattrcb_quota,
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f1ddc9d03c10..76724efc831c 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -117,25 +117,25 @@ config CIFS_UPCALL
secure Kerberos authentication is required). If unsure, say Y.
config CIFS_XATTR
- bool "CIFS extended attributes"
- depends on CIFS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page for details).
- CIFS maps the name of extended attributes beginning with the user
- namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
- servers without the user namespace prefix, but their names are
- seen by Linux cifs clients prefaced by the user namespace prefix.
- The system namespace (used by some filesystems to store ACLs) is
- not supported at this time.
-
- If unsure, say Y.
+ bool "CIFS extended attributes"
+ depends on CIFS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page for details).
+ CIFS maps the name of extended attributes beginning with the user
+ namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
+ servers without the user namespace prefix, but their names are
+ seen by Linux cifs clients prefaced by the user namespace prefix.
+ The system namespace (used by some filesystems to store ACLs) is
+ not supported at this time.
+
+ If unsure, say Y.
config CIFS_POSIX
- bool "CIFS POSIX Extensions"
- depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
- help
- Enabling this option will cause the cifs client to attempt to
+ bool "CIFS POSIX Extensions"
+ depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
+ help
+ Enabling this option will cause the cifs client to attempt to
negotiate a newer dialect with servers, such as Samba 3.0.5
or later, that optionally can handle more POSIX like (rather
than Windows like) file behavior. It also enables
@@ -144,61 +144,62 @@ config CIFS_POSIX
CIFS POSIX ACL support. If unsure, say N.
config CIFS_ACL
- bool "Provide CIFS ACL support"
- depends on CIFS_XATTR && KEYS
- help
- Allows fetching CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller. See the man
- page for getcifsacl for more information. If unsure, say Y.
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
+ help
+ Allows fetching CIFS/NTFS ACL from the server. The DACL blob
+ is handed over to the application/caller. See the man
+ page for getcifsacl for more information. If unsure, say Y.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
default y
depends on CIFS
help
- Enabling this option adds helpful debugging messages to
- the cifs code which increases the size of the cifs module.
- If unsure, say Y.
+ Enabling this option adds helpful debugging messages to
+ the cifs code which increases the size of the cifs module.
+ If unsure, say Y.
+
config CIFS_DEBUG2
bool "Enable additional CIFS debugging routines"
depends on CIFS_DEBUG
help
- Enabling this option adds a few more debugging routines
- to the cifs code which slightly increases the size of
- the cifs module and can cause additional logging of debug
- messages in some error paths, slowing performance. This
- option can be turned off unless you are debugging
- cifs problems. If unsure, say N.
+ Enabling this option adds a few more debugging routines
+ to the cifs code which slightly increases the size of
+ the cifs module and can cause additional logging of debug
+ messages in some error paths, slowing performance. This
+ option can be turned off unless you are debugging
+ cifs problems. If unsure, say N.
config CIFS_DEBUG_DUMP_KEYS
bool "Dump encryption keys for offline decryption (Unsafe)"
depends on CIFS_DEBUG
help
- Enabling this will dump the encryption and decryption keys
- used to communicate on an encrypted share connection on the
- console. This allows Wireshark to decrypt and dissect
- encrypted network captures. Enable this carefully.
- If unsure, say N.
+ Enabling this will dump the encryption and decryption keys
+ used to communicate on an encrypted share connection on the
+ console. This allows Wireshark to decrypt and dissect
+ encrypted network captures. Enable this carefully.
+ If unsure, say N.
config CIFS_DFS_UPCALL
- bool "DFS feature support"
- depends on CIFS && KEYS
- select DNS_RESOLVER
- help
- Distributed File System (DFS) support is used to access shares
- transparently in an enterprise name space, even if the share
- moves to a different server. This feature also enables
- an upcall mechanism for CIFS which contacts userspace helper
- utilities to provide server name resolution (host names to
- IP addresses) which is needed in order to reconnect to
- servers if their addresses change or for implicit mounts of
- DFS junction points. If unsure, say Y.
+ bool "DFS feature support"
+ depends on CIFS && KEYS
+ select DNS_RESOLVER
+ help
+ Distributed File System (DFS) support is used to access shares
+ transparently in an enterprise name space, even if the share
+ moves to a different server. This feature also enables
+ an upcall mechanism for CIFS which contacts userspace helper
+ utilities to provide server name resolution (host names to
+ IP addresses) which is needed in order to reconnect to
+ servers if their addresses change or for implicit mounts of
+ DFS junction points. If unsure, say Y.
config CIFS_NFSD_EXPORT
- bool "Allow nfsd to export CIFS file system"
- depends on CIFS && BROKEN
- help
- Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+ bool "Allow nfsd to export CIFS file system"
+ depends on CIFS && BROKEN
+ help
+ Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB_DIRECT
bool "SMB Direct support (Experimental)"
@@ -209,10 +210,9 @@ config CIFS_SMB_DIRECT
say N.
config CIFS_FSCACHE
- bool "Provide CIFS client caching support"
- depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
- help
- Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
- to be cached locally on disk through the general filesystem cache
- manager. If unsure, say N.
-
+ bool "Provide CIFS client caching support"
+ depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+ help
+ Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+ to be cached locally on disk through the general filesystem cache
+ manager. If unsure, say N.
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d9b99abe1243..5d83c924cc47 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -285,9 +285,9 @@ static void dump_referral(const struct dfs_info3_param *ref)
{
cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
- cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+ cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
ref->flags, ref->server_type);
- cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+ cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
ref->ref_flag, ref->path_consumed);
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 42f0d67f1054..ed49222abecb 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -58,6 +58,7 @@ struct cifs_sb_info {
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 62d48d486d8f..217276b8b942 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -381,7 +381,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_puts(s, "krb5");
+ seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid));
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
@@ -554,6 +554,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rsize=%u", cifs_sb->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+ seq_printf(s, ",bsize=%u", cifs_sb->bsize);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
if (tcon->snapshot_time)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7652551a1fc4..142164ef1f05 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.17"
+#define CIFS_VERSION "2.18"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94dbdbe5be34..f293e052e351 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -216,6 +216,7 @@ struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
struct cifs_open_parms;
+struct cifs_credits;
struct smb_version_operations {
int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *,
@@ -230,12 +231,15 @@ struct smb_version_operations {
/* check response: verify signature, map error */
int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
bool);
- void (*add_credits)(struct TCP_Server_Info *, const unsigned int,
- const int);
+ void (*add_credits)(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits,
+ const int optype);
void (*set_credits)(struct TCP_Server_Info *, const int);
int * (*get_credits_field)(struct TCP_Server_Info *, const int);
unsigned int (*get_credits)(struct mid_q_entry *);
__u64 (*get_next_mid)(struct TCP_Server_Info *);
+ void (*revert_current_mid)(struct TCP_Server_Info *server,
+ const unsigned int val);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
/*
@@ -383,8 +387,8 @@ struct smb_version_operations {
struct cifs_fid *);
/* calculate a size of SMB message */
unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
- /* check for STATUS_PENDING and process it in a positive case */
- bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+ /* check for STATUS_PENDING and process the response if yes */
+ bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
bool (*is_session_expired)(char *);
/* send oplock break response */
@@ -452,7 +456,11 @@ struct smb_version_operations {
unsigned int (*wp_retry_size)(struct inode *);
/* get mtu credits */
int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
- unsigned int *, unsigned int *);
+ unsigned int *, struct cifs_credits *);
+ /* adjust previously taken mtu credits to request size */
+ int (*adjust_credits)(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size);
/* check if we need to issue closedir */
bool (*dir_needs_close)(struct cifsFileInfo *);
long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
@@ -557,6 +565,7 @@ struct smb_vol {
bool resilient:1; /* noresilient not required since not fored for CA */
bool domainauto:1;
bool rdma:1;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -710,6 +719,11 @@ struct TCP_Server_Info {
int nr_targets;
};
+struct cifs_credits {
+ unsigned int value;
+ unsigned int instance;
+};
+
static inline unsigned int
in_flight(struct TCP_Server_Info *server)
{
@@ -731,18 +745,18 @@ has_credits(struct TCP_Server_Info *server, int *credits)
}
static inline void
-add_credits(struct TCP_Server_Info *server, const unsigned int add,
+add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
const int optype)
{
- server->ops->add_credits(server, add, optype);
+ server->ops->add_credits(server, credits, optype);
}
static inline void
-add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+add_credits_and_wake_if(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
- if (add) {
- server->ops->add_credits(server, add, optype);
+ if (credits->value) {
+ server->ops->add_credits(server, credits, optype);
wake_up(&server->request_q);
}
}
@@ -753,6 +767,14 @@ set_credits(struct TCP_Server_Info *server, const int val)
server->ops->set_credits(server, val);
}
+static inline int
+adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ return server->ops->adjust_credits ?
+ server->ops->adjust_credits(server, credits, payload_size) : 0;
+}
+
static inline __le64
get_next_mid64(struct TCP_Server_Info *server)
{
@@ -770,6 +792,22 @@ get_next_mid(struct TCP_Server_Info *server)
return cpu_to_le16(mid);
}
+static inline void
+revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ if (server->ops->revert_current_mid)
+ server->ops->revert_current_mid(server, val);
+}
+
+static inline void
+revert_current_mid_from_hdr(struct TCP_Server_Info *server,
+ const struct smb2_sync_hdr *shdr)
+{
+ unsigned int num = le16_to_cpu(shdr->CreditCharge);
+
+ return revert_current_mid(server, num > 0 ? num : 1);
+}
+
static inline __u16
get_mid(const struct smb_hdr *smb)
{
@@ -1234,7 +1272,7 @@ struct cifs_readdata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1260,7 +1298,7 @@ struct cifs_writedata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1422,6 +1460,7 @@ struct mid_q_entry {
struct kref refcount;
struct TCP_Server_Info *server; /* server corresponding to this mid */
__u64 mid; /* multiplex id */
+ __u16 credits; /* number of credits consumed by this mid */
__u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 336c116995d7..4f96b3b00a7a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -93,7 +93,8 @@ extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
extern int cifs_call_async(struct TCP_Server_Info *server,
struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags);
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits);
extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
struct smb_rqst *rqst, int *resp_buf_type,
const int flags, struct kvec *resp_iov);
@@ -115,7 +116,7 @@ extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
unsigned int size, unsigned int *num,
- unsigned int *credits);
+ struct cifs_credits *credits);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */, const int flags,
@@ -133,6 +134,9 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
+ bool fsuid_only,
+ struct cifsFileInfo **ret_file);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bb54ccf8481c..f43747c062a7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -139,8 +139,8 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
return -ENOMEM;
if (tcon->ipc) {
- snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
goto out;
}
@@ -172,7 +172,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -822,9 +822,10 @@ static void
cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
DeleteMidQEntry(mid);
- add_credits(server, 1, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
int
@@ -859,7 +860,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
iov[1].iov_base = (char *)smb + 4;
rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL,
- server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
+ server, CIFS_ASYNC_OP | CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -1605,16 +1606,17 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0)) {
+ server->ops->is_status_pending(buf, server)) {
cifs_discard_remaining_data(server);
return -1;
}
/* set up first two iov for signature check and to get credits */
rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
- rdata->iov[1].iov_len = server->total_read - 4;
+ rdata->iov[0].iov_len = server->vals->header_preamble_size;
+ rdata->iov[1].iov_base = buf + server->vals->header_preamble_size;
+ rdata->iov[1].iov_len =
+ server->total_read - server->vals->header_preamble_size;
cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
@@ -1713,6 +1715,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
__func__, mid->mid, mid->mid_state, rdata->result,
@@ -1750,7 +1753,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, 1, 0);
+ add_credits(server, &credits, 0);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1809,7 +1812,7 @@ cifs_async_readv(struct cifs_readdata *rdata)
kref_get(&rdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
- cifs_readv_callback, NULL, rdata, 0);
+ cifs_readv_callback, NULL, rdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
@@ -2123,14 +2126,18 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
- wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+ rc = cifs_get_writable_file(CIFS_I(inode), false,
+ &wdata2->cfile);
if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- break;
+ cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
+ rc);
+ if (!is_retryable_error(rc))
+ rc = -EBADF;
+ } else {
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2,
+ cifs_writedata_release);
}
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2, cifs_writedata_release);
for (j = 0; j < nr_pages; j++) {
unlock_page(wdata2->pages[j]);
@@ -2145,6 +2152,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
kref_put(&wdata2->refcount, cifs_writedata_release);
if (is_retryable_error(rc))
continue;
+ i += nr_pages;
break;
}
@@ -2152,6 +2160,13 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
i += nr_pages;
} while (i < wdata->nr_pages);
+ /* cleanup remaining pages from the original wdata */
+ for (; i < wdata->nr_pages; i++) {
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ put_page(wdata->pages[i]);
+ }
+
if (rc != 0 && !is_retryable_error(rc))
mapping_set_error(inode->i_mapping, rc);
kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2226,6 +2241,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -2261,7 +2277,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, 1, 0);
+ add_credits(tcon->ses->server, &credits, 0);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -2339,7 +2355,7 @@ cifs_async_writev(struct cifs_writedata *wdata,
kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
- cifs_writev_callback, NULL, wdata, 0);
+ cifs_writev_callback, NULL, wdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8463c940e0e5..b95db2b593cb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -102,7 +102,7 @@ enum {
Opt_backupuid, Opt_backupgid, Opt_uid,
Opt_cruid, Opt_gid, Opt_file_mode,
Opt_dirmode, Opt_port,
- Opt_rsize, Opt_wsize, Opt_actimeo,
+ Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
Opt_echo_interval, Opt_max_credits,
Opt_snapshot,
@@ -204,6 +204,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_dirmode, "dirmode=%s" },
{ Opt_dirmode, "dir_mode=%s" },
{ Opt_port, "port=%s" },
+ { Opt_blocksize, "bsize=%s" },
{ Opt_rsize, "rsize=%s" },
{ Opt_wsize, "wsize=%s" },
{ Opt_actimeo, "actimeo=%s" },
@@ -348,7 +349,7 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
return -ENOMEM;
}
- snprintf(unc, len, "\\\\%s", server->hostname);
+ scnprintf(unc, len, "\\\\%s", server->hostname);
rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
kfree(unc);
@@ -592,6 +593,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
+ set_credits(server, 1);
spin_lock(&GlobalMid_Lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
@@ -1053,7 +1055,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, length))
+ server->ops->is_status_pending(buf, server))
return -1;
if (!mid)
@@ -1063,6 +1065,26 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return 0;
}
+static void
+smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+ /*
+ * SMB1 does not use credits.
+ */
+ if (server->vals->header_preamble_size)
+ return;
+
+ if (shdr->CreditRequest) {
+ spin_lock(&server->req_lock);
+ server->credits += le16_to_cpu(shdr->CreditRequest);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ }
+}
+
+
static int
cifs_demultiplex_thread(void *p)
{
@@ -1192,6 +1214,7 @@ next_pdu:
} else if (server->ops->is_oplock_break &&
server->ops->is_oplock_break(bufs[i],
server)) {
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dbg(FYI, "Received oplock break\n");
} else {
cifs_dbg(VFS, "No task to wake, unknown frame "
@@ -1203,6 +1226,7 @@ next_pdu:
if (server->ops->dump_detail)
server->ops->dump_detail(bufs[i],
server);
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
}
@@ -1486,6 +1510,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol)
const char *delims = "/\\";
size_t len;
+ if (unlikely(!devname || !*devname)) {
+ cifs_dbg(VFS, "Device name not specified.\n");
+ return -EINVAL;
+ }
+
/* make sure we have a valid UNC double delimiter prefix */
len = strspn(devname, delims);
if (len != 2)
@@ -1571,7 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->cred_uid = current_uid();
vol->linux_uid = current_uid();
vol->linux_gid = current_gid();
-
+ vol->bsize = 1024 * 1024; /* can improve cp performance significantly */
/*
* default to SFM style remapping of seven reserved characters
* unless user overrides it or we negotiate CIFS POSIX where
@@ -1944,6 +1973,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
port = (unsigned short)option;
break;
+ case Opt_blocksize:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid blocksize value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ /*
+ * inode blocksize realistically should never need to be
+ * less than 16K or greater than 16M and default is 1MB.
+ * Note that small inode block sizes (e.g. 64K) can lead
+ * to very poor performance of common tools like cp and scp
+ */
+ if ((option < CIFS_MAX_MSGSIZE) ||
+ (option > (4 * SMB3_DEFAULT_IOSIZE))) {
+ cifs_dbg(VFS, "%s: Invalid blocksize\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->bsize = option;
+ break;
case Opt_rsize:
if (get_option_ul(args, &option)) {
cifs_dbg(VFS, "%s: Invalid rsize value\n",
@@ -2609,7 +2658,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
- tcp_ses->reconnect_instance = 0;
+ tcp_ses->reconnect_instance = 1;
tcp_ses->lstrp = jiffies;
spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
@@ -2770,7 +2819,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info)
if (tcon == NULL)
return -ENOMEM;
- snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
+ scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
/* cannot fail */
nls_codepage = load_nls_default();
@@ -3839,6 +3888,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
+ cifs_sb->bsize = pvolume_info->bsize;
/*
* Temporarily set r/wsize for matching superblock. If we end up using
* new sb then client will later negotiate it downward if needed.
@@ -4198,7 +4248,7 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
new_unc = kmalloc(len, GFP_KERNEL);
if (!new_unc)
return -ENOMEM;
- snprintf(new_unc, len, "\\%s", tgt);
+ scnprintf(new_unc, len, "\\%s", tgt);
kfree(vol->UNC);
vol->UNC = new_unc;
@@ -4902,8 +4952,6 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
if (!server->ops->need_neg(server))
return 0;
- set_credits(server, 1);
-
rc = server->ops->negotiate(xid, ses);
if (rc == 0) {
spin_lock(&GlobalMid_Lock);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 659ce1b92c44..4c144c1f50eb 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1842,24 +1842,30 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
return NULL;
}
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
- bool fsuid_only)
+/* Return -EBADF if no handle is found and general rc otherwise */
+int
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+ struct cifsFileInfo **ret_file)
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
bool any_available = false;
- int rc;
+ int rc = -EBADF;
unsigned int refind = 0;
- /* Having a null inode here (because mapping->host was set to zero by
- the VFS or MM) should not happen but we had reports of on oops (due to
- it being zero) during stress testcases so we need to check for it */
+ *ret_file = NULL;
+
+ /*
+ * Having a null inode here (because mapping->host was set to zero by
+ * the VFS or MM) should not happen but we had reports of on oops (due
+ * to it being zero) during stress testcases so we need to check for it
+ */
if (cifs_inode == NULL) {
cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
dump_stack();
- return NULL;
+ return rc;
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
@@ -1873,7 +1879,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
refind_writable:
if (refind > MAX_REOPEN_ATT) {
spin_unlock(&tcon->open_file_lock);
- return NULL;
+ return rc;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (!any_available && open_file->pid != current->tgid)
@@ -1885,7 +1891,8 @@ refind_writable:
/* found a good writable file */
cifsFileInfo_get(open_file);
spin_unlock(&tcon->open_file_lock);
- return open_file;
+ *ret_file = open_file;
+ return 0;
} else {
if (!inv_file)
inv_file = open_file;
@@ -1907,22 +1914,35 @@ refind_writable:
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
- if (!rc)
- return inv_file;
- else {
- spin_lock(&tcon->open_file_lock);
- list_move_tail(&inv_file->flist,
- &cifs_inode->openFileList);
- spin_unlock(&tcon->open_file_lock);
- cifsFileInfo_put(inv_file);
- ++refind;
- inv_file = NULL;
- spin_lock(&tcon->open_file_lock);
- goto refind_writable;
+ if (!rc) {
+ *ret_file = inv_file;
+ return 0;
}
+
+ spin_lock(&tcon->open_file_lock);
+ list_move_tail(&inv_file->flist, &cifs_inode->openFileList);
+ spin_unlock(&tcon->open_file_lock);
+ cifsFileInfo_put(inv_file);
+ ++refind;
+ inv_file = NULL;
+ spin_lock(&tcon->open_file_lock);
+ goto refind_writable;
}
- return NULL;
+ return rc;
+}
+
+struct cifsFileInfo *
+find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+{
+ struct cifsFileInfo *cfile;
+ int rc;
+
+ rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+ if (rc)
+ cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc);
+
+ return cfile;
}
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
@@ -1959,8 +1979,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (mapping->host->i_size - offset < (loff_t)to)
to = (unsigned)(mapping->host->i_size - offset);
- open_file = find_writable_file(CIFS_I(mapping->host), false);
- if (open_file) {
+ rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+ if (!rc) {
bytes_written = cifs_write(open_file, open_file->pid,
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
@@ -1970,9 +1990,12 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
rc = 0;
else if (bytes_written < 0)
rc = bytes_written;
+ else
+ rc = -EFAULT;
} else {
- cifs_dbg(FYI, "No writeable filehandles for inode\n");
- rc = -EIO;
+ cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc);
+ if (!is_retryable_error(rc))
+ rc = -EIO;
}
kunmap(page);
@@ -2079,9 +2102,9 @@ static int
wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
struct address_space *mapping, struct writeback_control *wbc)
{
- int rc = 0;
- struct TCP_Server_Info *server;
- unsigned int i;
+ int rc;
+ struct TCP_Server_Info *server =
+ tlink_tcon(wdata->cfile->tlink)->ses->server;
wdata->sync_mode = wbc->sync_mode;
wdata->nr_pages = nr_pages;
@@ -2091,21 +2114,16 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
page_offset(wdata->pages[nr_pages - 1]),
(loff_t)PAGE_SIZE);
wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
+ wdata->pid = wdata->cfile->pid;
- if (wdata->cfile != NULL)
- cifsFileInfo_put(wdata->cfile);
- wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
- if (!wdata->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- } else {
- wdata->pid = wdata->cfile->pid;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata, cifs_writedata_release);
- }
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ return rc;
- for (i = 0; i < nr_pages; ++i)
- unlock_page(wdata->pages[i]);
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
return rc;
}
@@ -2113,11 +2131,13 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+ struct inode *inode = mapping->host;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct TCP_Server_Info *server;
bool done = false, scanned = false, range_whole = false;
pgoff_t end, index;
struct cifs_writedata *wdata;
+ struct cifsFileInfo *cfile = NULL;
int rc = 0;
int saved_rc = 0;
unsigned int xid;
@@ -2143,11 +2163,23 @@ static int cifs_writepages(struct address_space *mapping,
server = cifs_sb_master_tcon(cifs_sb)->ses->server;
retry:
while (!done && index <= end) {
- unsigned int i, nr_pages, found_pages, wsize, credits;
+ unsigned int i, nr_pages, found_pages, wsize;
pgoff_t next = 0, tofind, saved_index = index;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+ int get_file_rc = 0;
+
+ if (cfile)
+ cifsFileInfo_put(cfile);
+
+ rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+
+ /* in case of an error store it to return later */
+ if (rc)
+ get_file_rc = rc;
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ &wsize, credits);
if (rc != 0) {
done = true;
break;
@@ -2180,13 +2212,26 @@ retry:
continue;
}
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
+ wdata->cfile = cfile;
+ cfile = NULL;
- rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+ if (!wdata->cfile) {
+ cifs_dbg(VFS, "No writable handle in writepages rc=%d\n",
+ get_file_rc);
+ if (is_retryable_error(get_file_rc))
+ rc = get_file_rc;
+ else
+ rc = -EBADF;
+ } else
+ rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+
+ for (i = 0; i < nr_pages; ++i)
+ unlock_page(wdata->pages[i]);
/* send failure -- clean up the mess */
if (rc != 0) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
for (i = 0; i < nr_pages; ++i) {
if (is_retryable_error(rc))
redirty_page_for_writepage(wbc,
@@ -2238,6 +2283,8 @@ retry:
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+ if (cfile)
+ cifsFileInfo_put(cfile);
free_xid(xid);
return rc;
}
@@ -2567,7 +2614,8 @@ static int
cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
@@ -2577,18 +2625,19 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
* Note: we are attempting to resend the whole wdata not in segments
*/
do {
- rc = server->ops->wait_mtu_credits(
- server, wdata->bytes, &wsize, &credits);
+ rc = server->ops->wait_mtu_credits(server, wdata->bytes, &wsize,
+ &credits);
if (rc)
goto out;
if (wsize < wdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
+ add_credits_and_wake_if(server, &credits, 0);
msleep(1000);
}
} while (wsize < wdata->bytes);
+ wdata->credits = credits;
rc = -EAGAIN;
while (rc == -EAGAIN) {
rc = 0;
@@ -2604,7 +2653,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
return 0;
}
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
out:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
@@ -2627,6 +2676,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct TCP_Server_Info *server;
struct page **pagevec;
size_t start;
+ unsigned int xid;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2634,12 +2684,23 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
pid = current->tgid;
server = tlink_tcon(open_file->tlink)->ses->server;
+ xid = get_xid();
do {
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, false);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ &wsize, credits);
if (rc)
break;
@@ -2731,16 +2792,22 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pid = pid;
wdata->bytes = cur_len;
wdata->pagesz = PAGE_SIZE;
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
wdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!wdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(wdata->cfile, false)))
- rc = server->ops->async_writev(wdata,
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+ if (!rc) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
if (rc == -EAGAIN) {
@@ -2756,6 +2823,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
len -= cur_len;
} while (len > 0);
+ free_xid(xid);
return rc;
}
@@ -3028,14 +3096,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
* these pages but not on the region from pos to ppos+len-1.
*/
written = cifs_user_writev(iocb, from);
- if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ if (CIFS_CACHE_READ(cinode)) {
/*
- * Windows 7 server can delay breaking level2 oplock if a write
- * request comes - break it on the client to prevent reading
- * an old data.
+ * We have read level caching and we have just sent a write
+ * request to the server thus making data in the cache stale.
+ * Zap the cache and set oplock/lease level to NONE to avoid
+ * reading stale data from the cache. All subsequent read
+ * operations will read new data from the server.
*/
cifs_zap_mapping(inode);
- cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+ cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n",
inode);
cinode->oplock = 0;
}
@@ -3260,7 +3330,8 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
struct list_head *rdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int rsize, credits;
+ unsigned int rsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
@@ -3277,11 +3348,12 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
goto out;
if (rsize < rdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
+ add_credits_and_wake_if(server, &credits, 0);
msleep(1000);
}
} while (rsize < rdata->bytes);
+ rdata->credits = credits;
rc = -EAGAIN;
while (rc == -EAGAIN) {
rc = 0;
@@ -3297,7 +3369,7 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
return 0;
}
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
out:
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
@@ -3311,7 +3383,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
- unsigned int npages, rsize, credits;
+ unsigned int npages, rsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
size_t cur_len;
int rc;
pid_t pid;
@@ -3331,8 +3405,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
iov_iter_advance(&direct_iov, offset - ctx->pos);
do {
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -3406,15 +3488,21 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
rdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
if (rc == -EAGAIN) {
@@ -3533,8 +3621,6 @@ again:
ctx->total_len = ctx->len - iov_iter_count(to);
}
- cifs_stats_bytes_read(tcon, ctx->total_len);
-
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
@@ -4095,10 +4181,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
loff_t offset;
struct page *page, *tpage;
struct cifs_readdata *rdata;
- unsigned credits;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -4144,18 +4239,24 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
list_del(&page->lru);
rdata->pages[rdata->nr_pages++] = page;
}
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
lru_cache_add_file(page);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 478003644916..53fdb5df0d2e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2080,7 +2080,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
return rc;
generic_fillattr(inode, stat);
- stat->blksize = CIFS_MAX_MSGSIZE;
+ stat->blksize = cifs_sb->bsize;
stat->ino = CIFS_I(inode)->uniqueid;
/* old CIFS Unix Extensions doesn't return create time */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2148b0f60e5e..62216dc8f9f5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -103,9 +103,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
return rc;
}
- snprintf(md5_str2, sizeof(md5_str2),
- CIFS_MF_SYMLINK_MD5_FORMAT,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(md5_str2, sizeof(md5_str2),
+ CIFS_MF_SYMLINK_MD5_FORMAT,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
if (strncmp(md5_str1, md5_str2, 17) != 0)
return -EINVAL;
@@ -142,10 +142,10 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
return rc;
}
- snprintf(buf, buf_len,
- CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
- link_len,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(buf, buf_len,
+ CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+ link_len,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
memcpy(buf + ofs, link_str, link_len);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 32a6c020478f..f0ce27c3c6e4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -117,11 +117,11 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
}
static void
-cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+cifs_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
spin_lock(&server->req_lock);
- server->credits += add;
+ server->credits += credits->value;
server->in_flight--;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
@@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
remaining = tgt_total_cnt - total_in_tgt;
if (remaining < 0) {
- cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+ cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n",
tgt_total_cnt, total_in_tgt);
return -EPROTO;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7b8b58fb4d3f..0e3570e40ff8 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
__u8 lease_state;
struct list_head *tmp;
struct cifsFileInfo *cfile;
- struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_pending_open *open;
struct cifsInodeInfo *cinode;
int ack_req = le32_to_cpu(rsp->Flags &
@@ -537,13 +536,25 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
- server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
-
if (ack_req)
cfile->oplock_break_cancelled = false;
else
cfile->oplock_break_cancelled = true;
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+
+ /*
+ * Set or clear flags depending on the lease state being READ.
+ * HANDLE caching flag should be added when the client starts
+ * to defer closing remote file handles with HANDLE leases.
+ */
+ if (lease_state & SMB2_LEASE_READ_CACHING_HE)
+ set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+
queue_work(cifsoplockd_wq, &cfile->oplock_break);
kfree(lw);
return true;
@@ -648,13 +659,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
return false;
- if (rsp->sync_hdr.CreditRequest) {
- spin_lock(&server->req_lock);
- server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- }
-
if (rsp->StructureSize !=
smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
if (le16_to_cpu(rsp->StructureSize) == 44)
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 6f96e2292856..085e91436da7 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -67,10 +67,13 @@ change_conf(struct TCP_Server_Info *server)
}
static void
-smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+smb2_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
int *val, rc = -1;
+ unsigned int add = credits->value;
+ unsigned int instance = credits->instance;
+ bool reconnect_detected = false;
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
@@ -79,8 +82,11 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
server->hostname, *val);
+ if ((instance == 0) || (instance == server->reconnect_instance))
+ *val += add;
+ else
+ reconnect_detected = true;
- *val += add;
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
printk_once(KERN_WARNING "server overflowed SMB3 credits\n");
@@ -102,7 +108,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- if (server->tcpStatus == CifsNeedReconnect)
+ if (reconnect_detected)
+ cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n",
+ add, instance);
+
+ if (server->tcpStatus == CifsNeedReconnect
+ || server->tcpStatus == CifsExiting)
return;
switch (rc) {
@@ -163,7 +174,7 @@ smb2_get_credits(struct mid_q_entry *mid)
static int
smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
int rc = 0;
unsigned int scredits;
@@ -189,7 +200,8 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
/* can deadlock with reopen */
if (scredits <= 8) {
*num = SMB2_MAX_BUFFER_SIZE;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = 0;
break;
}
@@ -198,8 +210,10 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
*num = min_t(unsigned int, size,
scredits * SMB2_MAX_BUFFER_SIZE);
- *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
- server->credits -= *credits;
+ credits->value =
+ DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+ credits->instance = server->reconnect_instance;
+ server->credits -= credits->value;
server->in_flight++;
break;
}
@@ -208,6 +222,38 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
return rc;
}
+static int
+smb2_adjust_credits(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);
+
+ if (!credits->value || credits->value == new_val)
+ return 0;
+
+ if (credits->value < new_val) {
+ WARN_ONCE(1, "request has less credits (%d) than required (%d)",
+ credits->value, new_val);
+ return -ENOTSUPP;
+ }
+
+ spin_lock(&server->req_lock);
+
+ if (server->reconnect_instance != credits->instance) {
+ spin_unlock(&server->req_lock);
+ cifs_dbg(VFS, "trying to return %d credits to old session\n",
+ credits->value - new_val);
+ return -EAGAIN;
+ }
+
+ server->credits += credits->value - new_val;
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ credits->value = new_val;
+ return 0;
+}
+
static __u64
smb2_get_next_mid(struct TCP_Server_Info *server)
{
@@ -219,6 +265,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
return mid;
}
+static void
+smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ spin_lock(&GlobalMid_Lock);
+ if (server->CurrentMid >= val)
+ server->CurrentMid -= val;
+ spin_unlock(&GlobalMid_Lock);
+}
+
static struct mid_q_entry *
smb2_find_mid(struct TCP_Server_Info *server, char *buf)
{
@@ -940,6 +995,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
+ if (ses->server->ops->query_all_EAs) {
+ if (!ea_value) {
+ rc = ses->server->ops->query_all_EAs(xid, tcon, path,
+ ea_name, NULL, 0,
+ cifs_sb);
+ if (rc == -ENODATA)
+ goto sea_exit;
+ }
+ }
+
/* Open */
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
@@ -1753,14 +1818,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
* the number of credits and return true. Otherwise - return false.
*/
static bool
-smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
+smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
if (shdr->Status != STATUS_PENDING)
return false;
- if (!length) {
+ if (shdr->CreditRequest) {
spin_lock(&server->req_lock);
server->credits += le16_to_cpu(shdr->CreditRequest);
spin_unlock(&server->req_lock);
@@ -2595,6 +2660,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server,
}
static void
+smb21_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ server->ops->set_oplock_level(cinode,
+ set_level2 ? SMB2_LEASE_READ_CACHING_HE :
+ 0, 0, NULL);
+}
+
+static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
@@ -3210,15 +3284,15 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0))
+ server->ops->is_status_pending(buf, server))
return -1;
/* set up first two iov to get credits */
rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
+ rdata->iov[0].iov_len = 0;
+ rdata->iov[1].iov_base = buf;
rdata->iov[1].iov_len =
- min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4;
+ min_t(unsigned int, buf_len, server->vals->read_rsp_size);
cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
@@ -3541,6 +3615,7 @@ struct smb_version_operations smb20_operations = {
.get_credits = smb2_get_credits,
.wait_mtu_credits = cifs_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3635,7 +3710,9 @@ struct smb_version_operations smb21_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3646,7 +3723,7 @@ struct smb_version_operations smb21_operations = {
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -3731,7 +3808,9 @@ struct smb_version_operations smb30_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3743,7 +3822,7 @@ struct smb_version_operations smb30_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -3836,7 +3915,9 @@ struct smb_version_operations smb311_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3848,7 +3929,7 @@ struct smb_version_operations smb311_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 77b3aaa39b35..60fbe306f604 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -107,13 +107,13 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
struct TCP_Server_Info *server = tcon->ses->server;
spin_lock(&server->req_lock);
- /* Request up to 2 credits but don't go over the limit. */
+ /* Request up to 10 credits but don't go over the limit. */
if (server->credits >= server->max_credits)
shdr->CreditRequest = cpu_to_le16(0);
else
shdr->CreditRequest = cpu_to_le16(
min_t(int, server->max_credits -
- server->credits, 2));
+ server->credits, 10));
spin_unlock(&server->req_lock);
} else {
shdr->CreditRequest = cpu_to_le16(2);
@@ -173,8 +173,8 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
return -ENOMEM;
if (tcon->ipc) {
- snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
goto out;
}
@@ -206,7 +206,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -490,6 +490,23 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
{
pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ pneg_ctxt->Name[0] = 0x93;
+ pneg_ctxt->Name[1] = 0xAD;
+ pneg_ctxt->Name[2] = 0x25;
+ pneg_ctxt->Name[3] = 0x50;
+ pneg_ctxt->Name[4] = 0x9C;
+ pneg_ctxt->Name[5] = 0xB4;
+ pneg_ctxt->Name[6] = 0x11;
+ pneg_ctxt->Name[7] = 0xE7;
+ pneg_ctxt->Name[8] = 0xB4;
+ pneg_ctxt->Name[9] = 0x23;
+ pneg_ctxt->Name[10] = 0x83;
+ pneg_ctxt->Name[11] = 0xDE;
+ pneg_ctxt->Name[12] = 0x96;
+ pneg_ctxt->Name[13] = 0x8B;
+ pneg_ctxt->Name[14] = 0xCD;
+ pneg_ctxt->Name[15] = 0x7C;
}
static void
@@ -986,8 +1003,14 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
(char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
-
- if (rc != 0) {
+ if (rc == -EOPNOTSUPP) {
+ /*
+ * Old Windows versions or Netapp SMB server can return
+ * not supported error. Client should accept it.
+ */
+ cifs_dbg(VFS, "Server does not support validate negotiate\n");
+ return 0;
+ } else if (rc != 0) {
cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
rc = -EIO;
goto out_free_inbuf;
@@ -1614,6 +1637,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ /* Need 64 for max size write so ask for more in case not there yet */
+ req->sync_hdr.CreditRequest = cpu_to_le16(64);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
@@ -2170,6 +2196,8 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
rqst.rq_iov = iov;
rqst.rq_nvec = n_iov;
+ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE,
+ FILE_WRITE_ATTRIBUTES);
/* resource #4: response buffer */
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
if (rc) {
@@ -2388,6 +2416,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc)
goto creat_exit;
+ trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid,
+ oparms->create_options, oparms->desired_access);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
@@ -2837,6 +2868,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto qinf_exit;
+ trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -2847,6 +2881,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
goto qinf_exit;
}
+ trace_smb3_query_info_done(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
if (dlen) {
*dlen = le32_to_cpu(rsp->OutputBufferLength);
if (!*data) {
@@ -2924,14 +2961,16 @@ smb2_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
if (mid->mid_state == MID_RESPONSE_RECEIVED
- || mid->mid_state == MID_RESPONSE_MALFORMED)
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ || mid->mid_state == MID_RESPONSE_MALFORMED) {
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ }
DeleteMidQEntry(mid);
- add_credits(server, credits_received, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
void smb2_reconnect_server(struct work_struct *work)
@@ -3023,7 +3062,7 @@ SMB2_echo(struct TCP_Server_Info *server)
iov[0].iov_base = (char *)req;
rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL,
- server, CIFS_ECHO_OP);
+ server, CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -3114,6 +3153,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->MinimumCount = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
+
+ trace_smb3_read_enter(0 /* xid */,
+ io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
@@ -3184,7 +3228,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
@@ -3199,7 +3243,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(shdr->CreditRequest);
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
/* result already set, check signature */
if (server->sign && !mid->decrypted) {
int rc;
@@ -3224,11 +3269,11 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_RESPONSE_MALFORMED:
- credits_received = le16_to_cpu(shdr->CreditRequest);
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
/* fall through */
default:
- if (rdata->result != -ENODATA)
- rdata->result = -EIO;
+ rdata->result = -EIO;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
@@ -3255,7 +3300,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_readv - send an async read, and set up mid to handle result */
@@ -3285,17 +3330,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
rc = smb2_new_read_req(
(void **) &buf, &total_len, &io_parms, rdata, 0, 0);
- if (rc) {
- if (rc == -EAGAIN && rdata->credits) {
- /* credits was reset by reconnect */
- rdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
+ if (rc)
return rc;
- }
if (smb3_encryption_required(io_parms.tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3305,24 +3341,24 @@ smb2_async_readv(struct cifs_readdata *rdata)
shdr = (struct smb2_sync_hdr *)buf;
- if (rdata->credits) {
+ if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
shdr->CreditRequest =
cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
- spin_lock(&server->req_lock);
- server->credits += rdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- rdata->credits = le16_to_cpu(shdr->CreditCharge);
+
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ if (rc)
+ goto async_readv_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&rdata->refcount);
rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
cifs_readv_receive, smb2_readv_callback,
- smb3_handle_read_data, rdata, flags);
+ smb3_handle_read_data, rdata, flags,
+ &rdata->credits);
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@ -3332,6 +3368,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
io_parms.offset, io_parms.length, rc);
}
+async_readv_out:
cifs_small_buf_release(buf);
return rc;
}
@@ -3378,7 +3415,10 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
- }
+ } else
+ trace_smb3_read_done(xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc == -ENODATA ? 0 : rc;
} else
@@ -3417,14 +3457,16 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
- wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ wdata->result = smb2_check_receive(mid, server, 0);
if (wdata->result != 0)
break;
@@ -3448,7 +3490,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
/* fall through */
default:
wdata->result = -EIO;
@@ -3481,7 +3524,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_writev - send an async write, and set up mid to handle result */
@@ -3499,17 +3542,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
unsigned int total_len;
rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len);
- if (rc) {
- if (rc == -EAGAIN && wdata->credits) {
- /* credits was reset by reconnect */
- wdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
- goto async_writev_out;
- }
+ if (rc)
+ return rc;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3526,6 +3560,9 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+
+ trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a server RDMA read, fill in and append
@@ -3595,23 +3632,22 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->Length = cpu_to_le32(wdata->bytes);
#endif
- if (wdata->credits) {
+ if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
shdr->CreditRequest =
cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
- spin_lock(&server->req_lock);
- server->credits += wdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- wdata->credits = le16_to_cpu(shdr->CreditCharge);
+
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ goto async_writev_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&wdata->refcount);
rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
- wdata, flags);
+ wdata, flags, &wdata->credits);
if (rc) {
trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
@@ -3674,6 +3710,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+ trace_smb3_write_enter(xid, io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
+
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
iov[0].iov_len = total_len - 1;
@@ -3836,6 +3876,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, output_size);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -3843,18 +3886,26 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ trace_smb3_query_dir_done(xid, persistent_fid,
+ tcon->tid, tcon->ses->Suid, index, 0);
srch_inf->endOfSearch = true;
rc = 0;
- } else
+ } else {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+ }
goto qdir_exit;
}
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
info_buf_size);
- if (rc)
+ if (rc) {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
goto qdir_exit;
+ }
srch_inf->unicode = true;
@@ -3882,6 +3933,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
else
cifs_dbg(VFS, "illegal search buffer type\n");
+ trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, srch_inf->entries_in_buffer);
return rc;
qdir_exit:
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 538e2299805f..0bd4d4802701 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -288,12 +288,12 @@ struct smb2_encryption_neg_context {
__le16 Ciphers[1]; /* Ciphers[0] since only one used now */
} __packed;
-#define POSIX_CTXT_DATA_LEN 8
+#define POSIX_CTXT_DATA_LEN 16
struct smb2_posix_neg_context {
__le16 ContextType; /* 0x100 */
__le16 DataLength;
__le32 Reserved;
- __le64 Reserved1; /* In case needed for future (eg version or caps) */
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
} __packed;
struct smb2_negotiate_rsp {
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 7b351c65ee46..d1181572758b 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
+ unsigned int credits = le16_to_cpu(shdr->CreditCharge);
if (server == NULL) {
cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
@@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
temp->mid = le64_to_cpu(shdr->MessageId);
+ temp->credits = credits > 0 ? credits : 1;
temp->pid = current->pid;
temp->command = shdr->Command; /* Always LE */
temp->when_alloc = jiffies;
@@ -600,6 +602,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
+ trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command), temp->mid);
return temp;
}
@@ -615,6 +619,10 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
return -EAGAIN;
}
+ if (ses->server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return -EAGAIN;
+
if (ses->status == CifsNew) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
(shdr->Command != SMB2_NEGOTIATE))
@@ -634,6 +642,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
spin_lock(&GlobalMid_Lock);
list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
+
return 0;
}
@@ -674,13 +683,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
smb2_seq_num_into_buf(ses->server, shdr);
rc = smb2_get_mid_entry(ses, shdr, &mid);
- if (rc)
+ if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
return ERR_PTR(rc);
+ }
+
rc = smb2_sign_rqst(rqst, ses->server);
if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
cifs_delete_mid(mid);
return ERR_PTR(rc);
}
+
return mid;
}
@@ -692,14 +706,21 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
+ if (server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return ERR_PTR(-EAGAIN);
+
smb2_seq_num_into_buf(server, shdr);
mid = smb2_mid_entry_alloc(shdr, server);
- if (mid == NULL)
+ if (mid == NULL) {
+ revert_current_mid_from_hdr(server, shdr);
return ERR_PTR(-ENOMEM);
+ }
rc = smb2_sign_rqst(rqst, server);
if (rc) {
+ revert_current_mid_from_hdr(server, shdr);
DeleteMidQEntry(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index a568dac7b3a1..b943b74cd246 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1550,7 +1550,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
char name[MAX_NAME_LEN];
int rc;
- snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
info->request_cache =
kmem_cache_create(
name,
@@ -1566,7 +1566,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->request_mempool)
goto out1;
- snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
info->response_cache =
kmem_cache_create(
name,
@@ -1582,7 +1582,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->response_mempool)
goto out3;
- snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
info->workqueue = create_workqueue(name);
if (!info->workqueue)
goto out4;
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 59be48206932..d8b049afa606 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -58,6 +58,7 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
DEFINE_SMB3_RW_ERR_EVENT(write_err);
DEFINE_SMB3_RW_ERR_EVENT(read_err);
+DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
/* For logging successful read or write */
@@ -100,8 +101,12 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
__u32 len), \
TP_ARGS(xid, fid, tid, sesid, offset, len))
+DEFINE_SMB3_RW_DONE_EVENT(write_enter);
+DEFINE_SMB3_RW_DONE_EVENT(read_enter);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
DEFINE_SMB3_RW_DONE_EVENT(write_done);
DEFINE_SMB3_RW_DONE_EVENT(read_done);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
/*
* For handle based calls other than read and write, and get/set info
@@ -148,6 +153,48 @@ DEFINE_SMB3_FD_ERR_EVENT(close_err);
/*
* For handle based query/set info calls
*/
+DECLARE_EVENT_CLASS(smb3_inf_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u8 infclass,
+ __u32 type),
+ TP_ARGS(xid, fid, tid, sesid, infclass, type),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u8, infclass)
+ __field(__u32, type)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->infclass = infclass;
+ __entry->type = type;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->infclass, __entry->type)
+)
+
+#define DEFINE_SMB3_INF_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u8 infclass, \
+ __u32 type), \
+ TP_ARGS(xid, fid, tid, sesid, infclass, type))
+
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter);
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_done);
+
DECLARE_EVENT_CLASS(smb3_inf_err_class,
TP_PROTO(unsigned int xid,
__u64 fid,
@@ -270,6 +317,7 @@ DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \
__u64 mid), \
TP_ARGS(tid, sesid, cmd, mid))
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_enter);
DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
DEFINE_SMB3_CMD_DONE_EVENT(ses_expired);
@@ -406,8 +454,47 @@ DEFINE_SMB3_TCON_EVENT(tcon);
/*
- * For smb2/smb3 open call
+ * For smb2/smb3 open (including create and mkdir) calls
*/
+
+DECLARE_EVENT_CLASS(smb3_open_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access),
+ TP_ARGS(xid, tid, sesid, create_options, desired_access),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_open_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access), \
+ TP_ARGS(xid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_ENTER_EVENT(open_enter);
+DEFINE_SMB3_OPEN_ENTER_EVENT(posix_mkdir_enter);
+
DECLARE_EVENT_CLASS(smb3_open_err_class,
TP_PROTO(unsigned int xid,
__u32 tid,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 53532bd3f50d..7ce8a585abd6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -33,6 +33,7 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
+#include <linux/signal.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -291,6 +292,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
int n_vec;
unsigned int send_length = 0;
unsigned int i, j;
+ sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
struct msghdr smb_msg;
@@ -301,8 +303,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smbd_send(server, rqst);
goto smbd_done;
}
+
if (ssocket == NULL)
- return -ENOTSOCK;
+ return -EAGAIN;
+
+ if (signal_pending(current)) {
+ cifs_dbg(FYI, "signal is pending before sending any data\n");
+ return -EINTR;
+ }
/* cork the socket */
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -312,6 +320,16 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
send_length += smb_rqst_len(server, &rqst[j]);
rfc1002_marker = cpu_to_be32(send_length);
+ /*
+ * We should not allow signals to interrupt the network send because
+ * any partial send will cause session reconnects thus increasing
+ * latency of system calls and overload a server with unnecessary
+ * requests.
+ */
+
+ sigfillset(&mask);
+ sigprocmask(SIG_BLOCK, &mask, &oldmask);
+
/* Generate a rfc1002 marker for SMB2+ */
if (server->vals->header_preamble_size == 0) {
struct kvec hiov = {
@@ -321,7 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
send_length += 4;
@@ -343,7 +361,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
@@ -365,7 +383,25 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
}
}
-uncork:
+unmask:
+ sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+ /*
+ * If signal is pending but we have already sent the whole packet to
+ * the server we need to return success status to allow a corresponding
+ * mid entry to be kept in the pending requests queue thus allowing
+ * to handle responses from the server by the client.
+ *
+ * If only part of the packet has been sent there is no need to hide
+ * interrupt because the session will be reconnected anyway, so there
+ * won't be any response from the server to handle.
+ */
+
+ if (signal_pending(current) && (total_len != send_length)) {
+ cifs_dbg(FYI, "signal is pending after attempt to send\n");
+ rc = -EINTR;
+ }
+
/* uncork it */
val = 0;
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -451,15 +487,18 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
static int
wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
- int *credits)
+ int *credits, unsigned int *instance)
{
int rc;
+ *instance = 0;
+
spin_lock(&server->req_lock);
if (timeout == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
server->in_flight++;
*credits -= 1;
+ *instance = server->reconnect_instance;
spin_unlock(&server->req_lock);
return 0;
}
@@ -489,6 +528,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
if (timeout != CIFS_BLOCKING_OP) {
*credits -= 1;
server->in_flight++;
+ *instance = server->reconnect_instance;
}
spin_unlock(&server->req_lock);
break;
@@ -499,7 +539,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
static int
wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
- const int optype)
+ const int optype, unsigned int *instance)
{
int *val;
@@ -507,15 +547,16 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
/* Since an echo is already inflight, no need to wait to send another */
if (*val <= 0 && optype == CIFS_ECHO_OP)
return -EAGAIN;
- return wait_for_free_credits(server, timeout, val);
+ return wait_for_free_credits(server, timeout, val, instance);
}
int
cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
*num = size;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = server->reconnect_instance;
return 0;
}
@@ -602,27 +643,43 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
int
cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags)
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits)
{
int rc, timeout, optype;
struct mid_q_entry *mid;
- unsigned int credits = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
+ unsigned int instance;
timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
if ((flags & CIFS_HAS_CREDITS) == 0) {
- rc = wait_for_free_request(server, timeout, optype);
+ rc = wait_for_free_request(server, timeout, optype, &instance);
if (rc)
return rc;
- credits = 1;
- }
+ credits.value = 1;
+ credits.instance = instance;
+ } else
+ instance = exist_credits->instance;
mutex_lock(&server->srv_mutex);
+
+ /*
+ * We can't use credits obtained from the previous session to send this
+ * request. Check if there were reconnects after we obtained credits and
+ * return -EAGAIN in such cases to let callers handle it.
+ */
+ if (instance != server->reconnect_instance) {
+ mutex_unlock(&server->srv_mutex);
+ add_credits_and_wake_if(server, &credits, optype);
+ return -EAGAIN;
+ }
+
mid = server->ops->setup_async_request(server, rqst);
if (IS_ERR(mid)) {
mutex_unlock(&server->srv_mutex);
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return PTR_ERR(mid);
}
@@ -647,6 +704,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
cifs_in_send_dec(server);
if (rc < 0) {
+ revert_current_mid(server, mid->credits);
server->sequence_number -= 2;
cifs_delete_mid(mid);
}
@@ -656,7 +714,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc == 0)
return 0;
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return rc;
}
@@ -786,8 +844,12 @@ static void
cifs_compound_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->server;
+ struct cifs_credits credits;
+
+ credits.value = server->ops->get_credits(mid);
+ credits.instance = server->reconnect_instance;
- add_credits(server, server->ops->get_credits(mid), mid->optype);
+ add_credits(server, &credits, mid->optype);
}
static void
@@ -813,7 +875,11 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
int timeout, optype;
struct mid_q_entry *midQ[MAX_COMPOUND];
bool cancelled_mid[MAX_COMPOUND] = {false};
- unsigned int credits[MAX_COMPOUND] = {0};
+ struct cifs_credits credits[MAX_COMPOUND] = {
+ { .value = 0, .instance = 0 }
+ };
+ unsigned int instance;
+ unsigned int first_instance = 0;
char *buf;
timeout = flags & CIFS_TIMEOUT_MASK;
@@ -830,16 +896,64 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (ses->server->tcpStatus == CifsExiting)
return -ENOENT;
+ spin_lock(&ses->server->req_lock);
+ if (ses->server->credits < num_rqst) {
+ /*
+ * Return immediately if not too many requests in flight since
+ * we will likely be stuck on waiting for credits.
+ */
+ if (ses->server->in_flight < num_rqst - ses->server->credits) {
+ spin_unlock(&ses->server->req_lock);
+ return -ENOTSUPP;
+ }
+ } else {
+ /* enough credits to send the whole compounded request */
+ ses->server->credits -= num_rqst;
+ ses->server->in_flight += num_rqst;
+ first_instance = ses->server->reconnect_instance;
+ }
+ spin_unlock(&ses->server->req_lock);
+
+ if (first_instance) {
+ cifs_dbg(FYI, "Acquired %d credits at once\n", num_rqst);
+ for (i = 0; i < num_rqst; i++) {
+ credits[i].value = 1;
+ credits[i].instance = first_instance;
+ }
+ goto setup_rqsts;
+ }
+
/*
+ * There are not enough credits to send the whole compound request but
+ * there are requests in flight that may bring credits from the server.
+ * This approach still leaves the possibility to be stuck waiting for
+ * credits if the server doesn't grant credits to the outstanding
+ * requests. This should be fixed by returning immediately and letting
+ * a caller fallback to sequential commands instead of compounding.
* Ensure we obtain 1 credit per request in the compound chain.
- * It can be optimized further by waiting for all the credits
- * at once but this can wait long enough if we don't have enough
- * credits due to some heavy operations in progress or the server
- * not granting us much, so a fallback to the current approach is
- * needed anyway.
*/
for (i = 0; i < num_rqst; i++) {
- rc = wait_for_free_request(ses->server, timeout, optype);
+ rc = wait_for_free_request(ses->server, timeout, optype,
+ &instance);
+
+ if (rc == 0) {
+ credits[i].value = 1;
+ credits[i].instance = instance;
+ /*
+ * All parts of the compound chain must get credits from
+ * the same session, otherwise we may end up using more
+ * credits than the server granted. If there were
+ * reconnects in between, return -EAGAIN and let callers
+ * handle it.
+ */
+ if (i == 0)
+ first_instance = instance;
+ else if (first_instance != instance) {
+ i++;
+ rc = -EAGAIN;
+ }
+ }
+
if (rc) {
/*
* We haven't sent an SMB packet to the server yet but
@@ -851,12 +965,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* requests correctly.
*/
for (j = 0; j < i; j++)
- add_credits(ses->server, 1, optype);
+ add_credits(ses->server, &credits[j], optype);
return rc;
}
- credits[i] = 1;
}
+setup_rqsts:
/*
* Make sure that we sign in the same order that we send on this socket
* and avoid races inside tcp sendmsg code that could cause corruption
@@ -865,16 +979,33 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
mutex_lock(&ses->server->srv_mutex);
+ /*
+ * All the parts of the compound chain belong obtained credits from the
+ * same session (see the appropriate checks above). In the same time
+ * there might be reconnects after those checks but before we acquired
+ * the srv_mutex. We can not use credits obtained from the previous
+ * session to send this request. Check if there were reconnects after
+ * we obtained credits and return -EAGAIN in such cases to let callers
+ * handle it.
+ */
+ if (first_instance != ses->server->reconnect_instance) {
+ mutex_unlock(&ses->server->srv_mutex);
+ for (j = 0; j < num_rqst; j++)
+ add_credits(ses->server, &credits[j], optype);
+ return -EAGAIN;
+ }
+
for (i = 0; i < num_rqst; i++) {
midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]);
if (IS_ERR(midQ[i])) {
+ revert_current_mid(ses->server, i);
for (j = 0; j < i; j++)
cifs_delete_mid(midQ[j]);
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
for (j = 0; j < num_rqst; j++)
- add_credits(ses->server, credits[j], optype);
+ add_credits(ses->server, &credits[j], optype);
return PTR_ERR(midQ[i]);
}
@@ -897,15 +1028,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
for (i = 0; i < num_rqst; i++)
cifs_save_when_sent(midQ[i]);
- if (rc < 0)
+ if (rc < 0) {
+ revert_current_mid(ses->server, num_rqst);
ses->server->sequence_number -= 2;
+ }
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
/* Sending failed for some reason - return credits back */
for (i = 0; i < num_rqst; i++)
- add_credits(ses->server, credits[i], optype);
+ add_credits(ses->server, &credits[i], optype);
goto out;
}
@@ -942,7 +1075,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
- credits[i] = 0;
+ credits[i].value = 0;
}
spin_unlock(&GlobalMid_Lock);
}
@@ -1068,6 +1201,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
if (ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1091,7 +1225,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- rc = wait_for_free_request(ses->server, timeout, 0);
+ rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance);
if (rc)
return rc;
@@ -1105,7 +1239,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1141,7 +1275,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
spin_unlock(&GlobalMid_Lock);
@@ -1149,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1165,7 +1299,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_check_receive(midQ, ses->server, 0);
out:
cifs_delete_mid(midQ);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1207,6 +1341,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ unsigned int instance;
if (tcon == NULL || tcon->ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1232,7 +1367,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0,
+ &instance);
if (rc)
return rc;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 02b7d91c9231..f0de238000c0 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,16 +1,16 @@
config FS_ENCRYPTION
- tristate "FS Encryption (Per-file encryption)"
+ bool "FS Encryption (Per-file encryption)"
select CRYPTO
select CRYPTO_AES
select CRYPTO_CBC
select CRYPTO_ECB
select CRYPTO_XTS
select CRYPTO_CTS
- select CRYPTO_CTR
select CRYPTO_SHA256
select KEYS
help
Enable encryption of files and directories. This
feature is similar to ecryptfs, but it is more memory
efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
+ decrypted pages in the page cache. Currently Ext4,
+ F2FS and UBIFS make use of this feature.
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0959044c5cee..5759bcd018cd 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
int ret = fscrypt_decrypt_page(page->mapping->host, page,
PAGE_SIZE, 0, page->index);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7424f851eb5c..7da276159593 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,7 +12,6 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
-#define __FS_HAS_ENCRYPTION 1
#include <linux/fscrypt.h>
#include <crypto/hash.h>
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 926e5df20ec3..56debb1fcf5e 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -58,7 +58,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
return err;
if (!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
return 0;
}
@@ -82,13 +82,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_ENCRYPTED(new_dir) &&
!fscrypt_has_permitted_context(new_dir,
d_inode(old_dentry)))
- return -EPERM;
+ return -EXDEV;
if ((flags & RENAME_EXCHANGE) &&
IS_ENCRYPTED(old_dir) &&
!fscrypt_has_permitted_context(old_dir,
d_inode(new_dentry)))
- return -EPERM;
+ return -EXDEV;
}
return 0;
}
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 1e11a683f63d..322ce9686bdb 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -47,7 +47,7 @@ static int derive_key_aes(const u8 *master_key,
tfm = NULL;
goto out;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
res = -ENOMEM;
@@ -257,7 +257,7 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
mode->friendly_name,
crypto_skcipher_alg(tfm)->base.cra_driver_name);
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
if (err)
goto err_free_tfm;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f490de921ce8..bd7eaf9b3f00 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -151,8 +151,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
* malicious offline violations of this constraint, while the link and rename
* checks are needed to prevent online violations of this constraint.
*
- * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
*/
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 29c68c5d44d5..95b5e78c22b1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -423,8 +423,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
* debugfs core.
*
* It is your responsibility to protect your struct file_operation
- * methods against file removals by means of debugfs_use_file_start()
- * and debugfs_use_file_finish(). ->open() is still protected by
+ * methods against file removals by means of debugfs_file_get()
+ * and debugfs_file_put(). ->open() is still protected by
* debugfs though.
*
* Any struct file_operations defined by means of
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c53814539070..553a3f3300ae 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
+ s->s_d_op = &simple_dentry_operations;
s->s_time_gran = 1;
error = -ENOMEM;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ec2fb6fe6d37..9bb015bc4a83 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
- bio_for_each_segment_all(bvec, bio, i) {
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
if (dio->op == REQ_OP_READ && !PageCompound(page) &&
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 76976d6e50f9..c98ad9777ad9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1089,12 +1089,12 @@ static void sctp_connect_to_sock(struct connection *con)
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
0);
memset(&tv, 0, sizeof(tv));
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
if (result == -EINPROGRESS)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 4dd842f72846..f664da55234e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -610,7 +610,8 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
full_alg_name);
goto out_free;
}
- crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(crypt_stat->tfm,
+ CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
rc = 0;
out_free:
kfree(full_alg_name);
@@ -1590,7 +1591,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
"[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
- crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
if (*key_size == 0)
*key_size = crypto_skcipher_default_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..4a0e98d87fcc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +182,6 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +201,16 @@ struct eventpoll {
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep in FIFO.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list only in one
+ * direction i.e. either to the tail either to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if element has been already added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that same element has been just
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to content with concurrent
+ * events from another file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth to mention is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if poll table was inited
+ * with several wait queues entries. Plural wakeup from different CPUs of a
+ * single wait queue is serialized by wq.lock, but the case when multiple wait
+ * queues are used should be detected accordingly. This is detected using
+ * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1155,24 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = READ_ONCE(ep->ovflist);
- WRITE_ONCE(ep->ovflist, epi);
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1488,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1500,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1531,10 +1614,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1578,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1599,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1771,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
goto send_events;
}
diff --git a/fs/exec.c b/fs/exec.c
index fb72d36f7823..2e0033348d8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,7 +932,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
if (bytes < 0) {
ret = bytes;
- goto out;
+ goto out_free;
}
if (bytes == 0)
@@ -1189,7 +1189,7 @@ no_thread_group:
flush_itimer_signals();
#endif
- if (atomic_read(&oldsighand->count) != 1) {
+ if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
/*
* This ->sighand is shared with the CLONE_SIGHAND
@@ -1199,7 +1199,7 @@ no_thread_group:
if (!newsighand)
return -ENOMEM;
- atomic_set(&newsighand->count, 1);
+ refcount_set(&newsighand->count, 1);
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
@@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
/*
* Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
@@ -1944,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt);
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- unsigned long old, new;
-
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- do {
- old = READ_ONCE(mm->flags);
- new = (old & ~MMF_DUMPABLE_MASK) | value;
- } while (cmpxchg(&mm->flags, old, new) != old);
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}
SYSCALL_DEFINE3(execve,
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
deleted file mode 100644
index 1b2d4c63a579..000000000000
--- a/fs/exofs/BUGS
+++ /dev/null
@@ -1,3 +0,0 @@
-- Out-of-space may cause a severe problem if the object (and directory entry)
- were written, but the inode attributes failed. Then if the filesystem was
- unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
deleted file mode 100644
index a364fd0965ec..000000000000
--- a/fs/exofs/Kbuild
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Kbuild for the EXOFS module
-#
-# Copyright (C) 2008 Panasas Inc. All rights reserved.
-#
-# Authors:
-# Boaz Harrosh <ooo@electrozaur.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2
-#
-# Kbuild - Gets included from the Kernels Makefile and build system
-#
-
-# ore module library
-libore-y := ore.o ore_raid.o
-obj-$(CONFIG_ORE) += libore.o
-
-exofs-y := inode.o file.o namei.o dir.o super.o sys.o
-obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
deleted file mode 100644
index 86194b2f799d..000000000000
--- a/fs/exofs/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config EXOFS_FS
- tristate "exofs: OSD based file system support"
- depends on SCSI_OSD_ULD
- help
- EXOFS is a file system that uses an OSD storage device,
- as its backing storage.
-
-# Debugging-related stuff
-config EXOFS_DEBUG
- bool "Enable debugging"
- depends on EXOFS_FS
- help
- This option enables EXOFS debug prints.
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
deleted file mode 100644
index 2daf2329c28d..000000000000
--- a/fs/exofs/Kconfig.ore
+++ /dev/null
@@ -1,14 +0,0 @@
-# ORE - Objects Raid Engine (libore.ko)
-#
-# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
-# for every ORE user we do it like this. Any user should add itself here
-# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
-# selected here, and we default to "ON". So in effect it is like been
-# selected by any of the users.
-config ORE
- tristate
- depends on EXOFS_FS || PNFS_OBJLAYOUT
- select ASYNC_XOR
- select RAID6_PQ
- select ASYNC_PQ
- default SCSI_OSD_ULD
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
deleted file mode 100644
index 7d88ef566213..000000000000
--- a/fs/exofs/common.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * common.h - Common definitions for both Kernel and user-mode utilities
- *
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef __EXOFS_COM_H__
-#define __EXOFS_COM_H__
-
-#include <linux/types.h>
-
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_sec.h>
-
-/****************************************************************************
- * Object ID related defines
- * NOTE: inode# = object ID - EXOFS_OBJ_OFF
- ****************************************************************************/
-#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
-#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
-#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
-#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
-#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
-
-/* exofs Application specific page/attribute */
-/* Inode attrs */
-# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
-# define EXOFS_ATTR_INODE_DATA 1
-# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
-# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
-/* Partition attrs */
-# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
-# define EXOFS_ATTR_SB_STATS 1
-
-/*
- * The maximum number of files we can have is limited by the size of the
- * inode number. This is the largest object ID that the file system supports.
- * Object IDs 0, 1, and 2 are always in use (see above defines).
- */
-enum {
- EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
- (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
- EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
-};
-
-/****************************************************************************
- * Misc.
- ****************************************************************************/
-#define EXOFS_BLKSHIFT 12
-#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
-
-/****************************************************************************
- * superblock-related things
- ****************************************************************************/
-#define EXOFS_SUPER_MAGIC 0x5DF5
-
-/*
- * The file system control block - stored in object EXOFS_SUPER_ID's data.
- * This is where the in-memory superblock is stored on disk.
- */
-enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
-struct exofs_fscb {
- __le64 s_nextid; /* Only used after mkfs */
- __le64 s_numfiles; /* Only used after mkfs */
- __le32 s_version; /* == EXOFS_FSCB_VER */
- __le16 s_magic; /* Magic signature */
- __le16 s_newfs; /* Non-zero if this is a new fs */
-
- /* From here on it's a static part, only written by mkexofs */
- __le64 s_dev_table_oid; /* Resurved, not used */
- __le64 s_dev_table_count; /* == 0 means no dev_table */
-} __packed;
-
-/*
- * This struct is set on the FS partition's attributes.
- * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
- * with the create command, to atomically persist the sb writeable information.
- */
-struct exofs_sb_stats {
- __le64 s_nextid; /* Highest object ID used */
- __le64 s_numfiles; /* Number of files on fs */
-} __packed;
-
-/*
- * Describes the raid used in the FS. It is part of the device table.
- * This here is taken from the pNFS-objects definition. In exofs we
- * use one raid policy through-out the filesystem. (NOTE: the funny
- * alignment at beginning. We take care of it at exofs_device_table.
- */
-struct exofs_dt_data_map {
- __le32 cb_num_comps;
- __le64 cb_stripe_unit;
- __le32 cb_group_width;
- __le32 cb_group_depth;
- __le32 cb_mirror_cnt;
- __le32 cb_raid_algorithm;
-} __packed;
-
-/*
- * This is an osd device information descriptor. It is a single entry in
- * the exofs device table. It describes an osd target lun which
- * contains data belonging to this FS. (Same partition_id on all devices)
- */
-struct exofs_dt_device_info {
- __le32 systemid_len;
- u8 systemid[OSD_SYSTEMID_LEN];
- __le64 long_name_offset; /* If !0 then offset-in-file */
- __le32 osdname_len; /* */
- u8 osdname[44]; /* Embbeded, Usually an asci uuid */
-} __packed;
-
-/*
- * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
- * It contains the raid used for this multy-device FS and an array of
- * participating devices.
- */
-struct exofs_device_table {
- __le32 dt_version; /* == EXOFS_DT_VER */
- struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
-
- /* Resurved space For future use. Total includeing this:
- * (8 * sizeof(le64))
- */
- __le64 __Resurved[4];
-
- __le64 dt_num_devices; /* Array size */
- struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
-} __packed;
-
-/****************************************************************************
- * inode-related things
- ****************************************************************************/
-#define EXOFS_IDATA 5
-
-/*
- * The file control block - stored in an object's attributes. This is where
- * the in-memory inode is stored on disk.
- */
-struct exofs_fcb {
- __le64 i_size; /* Size of the file */
- __le16 i_mode; /* File mode */
- __le16 i_links_count; /* Links count */
- __le32 i_uid; /* Owner Uid */
- __le32 i_gid; /* Group Id */
- __le32 i_atime; /* Access time */
- __le32 i_ctime; /* Creation time */
- __le32 i_mtime; /* Modification time */
- __le32 i_flags; /* File flags (unused for now)*/
- __le32 i_generation; /* File version (for NFS) */
- __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
-};
-
-#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
-
-/* This is the Attribute the fcb is stored in */
-static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_DATA,
- EXOFS_INO_ATTR_SIZE);
-
-/****************************************************************************
- * dentry-related things
- ****************************************************************************/
-#define EXOFS_NAME_LEN 255
-
-/*
- * The on-disk directory entry
- */
-struct exofs_dir_entry {
- __le64 inode_no; /* inode number */
- __le16 rec_len; /* directory entry length */
- u8 name_len; /* name length */
- u8 file_type; /* umm...file type */
- char name[EXOFS_NAME_LEN]; /* file name */
-};
-
-enum {
- EXOFS_FT_UNKNOWN,
- EXOFS_FT_REG_FILE,
- EXOFS_FT_DIR,
- EXOFS_FT_CHRDEV,
- EXOFS_FT_BLKDEV,
- EXOFS_FT_FIFO,
- EXOFS_FT_SOCK,
- EXOFS_FT_SYMLINK,
- EXOFS_FT_MAX
-};
-
-#define EXOFS_DIR_PAD 4
-#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
-#define EXOFS_DIR_REC_LEN(name_len) \
- (((name_len) + offsetof(struct exofs_dir_entry, name) + \
- EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
-
-/*
- * The on-disk (optional) layout structure.
- * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
- * attribute, attached to any inode, usually to a directory.
- */
-
-enum exofs_inode_layout_gen_functions {
- LAYOUT_MOVING_WINDOW = 0,
- LAYOUT_IMPLICT = 1,
-};
-
-struct exofs_on_disk_inode_layout {
- __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
- __le16 pad;
- union {
- /* gen_func == LAYOUT_MOVING_WINDOW (default) */
- struct exofs_layout_sliding_window {
- __le32 num_devices; /* first n devices in global-table*/
- } sliding_window __packed;
-
- /* gen_func == LAYOUT_IMPLICT */
- struct exofs_layout_implict_list {
- struct exofs_dt_data_map data_map;
- /* Variable array of size data_map.cb_num_comps. These
- * are device indexes of the devices in the global table
- */
- __le32 dev_indexes[];
- } implict __packed;
- };
-} __packed;
-
-static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
-{
- return sizeof(struct exofs_on_disk_inode_layout) +
- max_devs * sizeof(__le32);
-}
-
-#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
deleted file mode 100644
index f0138674c1ed..000000000000
--- a/fs/exofs/dir.c
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/iversion.h>
-#include "exofs.h"
-
-static inline unsigned exofs_chunk_size(struct inode *inode)
-{
- return inode->i_sb->s_blocksize;
-}
-
-static inline void exofs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
-static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
-{
- loff_t last_byte = inode->i_size;
-
- last_byte -= page_nr << PAGE_SHIFT;
- if (last_byte > PAGE_SIZE)
- last_byte = PAGE_SIZE;
- return last_byte;
-}
-
-static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
-{
- struct address_space *mapping = page->mapping;
- struct inode *dir = mapping->host;
- int err = 0;
-
- inode_inc_iversion(dir);
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
-
- if (pos+len > dir->i_size) {
- i_size_write(dir, pos+len);
- mark_inode_dirty(dir);
- }
- set_page_dirty(page);
-
- if (IS_DIRSYNC(dir))
- err = write_one_page(page);
- else
- unlock_page(page);
-
- return err;
-}
-
-static bool exofs_check_page(struct page *page)
-{
- struct inode *dir = page->mapping->host;
- unsigned chunk_size = exofs_chunk_size(dir);
- char *kaddr = page_address(page);
- unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
- struct exofs_dir_entry *p;
- char *error;
-
- /* if the page is the last one in the directory */
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
- if (limit & (chunk_size - 1))
- goto Ebadsize;
- if (!limit)
- goto out;
- }
- for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
- p = (struct exofs_dir_entry *)(kaddr + offs);
- rec_len = le16_to_cpu(p->rec_len);
-
- if (rec_len < EXOFS_DIR_REC_LEN(1))
- goto Eshort;
- if (rec_len & 3)
- goto Ealign;
- if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
- goto Enamelen;
- if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
- goto Espan;
- }
- if (offs != limit)
- goto Eend;
-out:
- SetPageChecked(page);
- return true;
-
-Ebadsize:
- EXOFS_ERR("ERROR [exofs_check_page]: "
- "size of directory(0x%lx) is not a multiple of chunk size\n",
- dir->i_ino
- );
- goto fail;
-Eshort:
- error = "rec_len is smaller than minimal";
- goto bad_entry;
-Ealign:
- error = "unaligned directory entry";
- goto bad_entry;
-Enamelen:
- error = "rec_len is too small for name_len";
- goto bad_entry;
-Espan:
- error = "directory entry across blocks";
- goto bad_entry;
-bad_entry:
- EXOFS_ERR(
- "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
- "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
- _LLU(le64_to_cpu(p->inode_no)),
- rec_len, p->name_len);
- goto fail;
-Eend:
- p = (struct exofs_dir_entry *)(kaddr + offs);
- EXOFS_ERR("ERROR [exofs_check_page]: "
- "entry in directory(0x%lx) spans the page boundary"
- "offset=%lu, inode=0x%llx\n",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
- _LLU(le64_to_cpu(p->inode_no)));
-fail:
- SetPageError(page);
- return false;
-}
-
-static struct page *exofs_get_page(struct inode *dir, unsigned long n)
-{
- struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
-
- if (!IS_ERR(page)) {
- kmap(page);
- if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !exofs_check_page(page))
- goto fail;
- }
- }
- return page;
-
-fail:
- exofs_put_page(page);
- return ERR_PTR(-EIO);
-}
-
-static inline int exofs_match(int len, const unsigned char *name,
- struct exofs_dir_entry *de)
-{
- if (len != de->name_len)
- return 0;
- if (!de->inode_no)
- return 0;
- return !memcmp(name, de->name, len);
-}
-
-static inline
-struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
-{
- return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
-}
-
-static inline unsigned
-exofs_validate_entry(char *base, unsigned offset, unsigned mask)
-{
- struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
- struct exofs_dir_entry *p =
- (struct exofs_dir_entry *)(base + (offset&mask));
- while ((char *)p < (char *)de) {
- if (p->rec_len == 0)
- break;
- p = exofs_next_entry(p);
- }
- return (char *)p - base;
-}
-
-static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
- [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
- [EXOFS_FT_REG_FILE] = DT_REG,
- [EXOFS_FT_DIR] = DT_DIR,
- [EXOFS_FT_CHRDEV] = DT_CHR,
- [EXOFS_FT_BLKDEV] = DT_BLK,
- [EXOFS_FT_FIFO] = DT_FIFO,
- [EXOFS_FT_SOCK] = DT_SOCK,
- [EXOFS_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
-};
-
-static inline
-void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
-{
- umode_t mode = inode->i_mode;
- de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
-}
-
-static int
-exofs_readdir(struct file *file, struct dir_context *ctx)
-{
- loff_t pos = ctx->pos;
- struct inode *inode = file_inode(file);
- unsigned int offset = pos & ~PAGE_MASK;
- unsigned long n = pos >> PAGE_SHIFT;
- unsigned long npages = dir_pages(inode);
- unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
-
- if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
- return 0;
-
- for ( ; n < npages; n++, offset = 0) {
- char *kaddr, *limit;
- struct exofs_dir_entry *de;
- struct page *page = exofs_get_page(inode, n);
-
- if (IS_ERR(page)) {
- EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
- inode->i_ino);
- ctx->pos += PAGE_SIZE - offset;
- return PTR_ERR(page);
- }
- kaddr = page_address(page);
- if (unlikely(need_revalidate)) {
- if (offset) {
- offset = exofs_validate_entry(kaddr, offset,
- chunk_mask);
- ctx->pos = (n<<PAGE_SHIFT) + offset;
- }
- file->f_version = inode_query_iversion(inode);
- need_revalidate = false;
- }
- de = (struct exofs_dir_entry *)(kaddr + offset);
- limit = kaddr + exofs_last_byte(inode, n) -
- EXOFS_DIR_REC_LEN(1);
- for (; (char *)de <= limit; de = exofs_next_entry(de)) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: "
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- exofs_put_page(page);
- return -EIO;
- }
- if (de->inode_no) {
- unsigned char t;
-
- if (de->file_type < EXOFS_FT_MAX)
- t = exofs_filetype_table[de->file_type];
- else
- t = DT_UNKNOWN;
-
- if (!dir_emit(ctx, de->name, de->name_len,
- le64_to_cpu(de->inode_no),
- t)) {
- exofs_put_page(page);
- return 0;
- }
- }
- ctx->pos += le16_to_cpu(de->rec_len);
- }
- exofs_put_page(page);
- }
- return 0;
-}
-
-struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
- struct dentry *dentry, struct page **res_page)
-{
- const unsigned char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
- unsigned long start, n;
- unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
- struct exofs_i_info *oi = exofs_i(dir);
- struct exofs_dir_entry *de;
-
- if (npages == 0)
- goto out;
-
- *res_page = NULL;
-
- start = oi->i_dir_start_lookup;
- if (start >= npages)
- start = 0;
- n = start;
- do {
- char *kaddr;
- page = exofs_get_page(dir, n);
- if (!IS_ERR(page)) {
- kaddr = page_address(page);
- de = (struct exofs_dir_entry *) kaddr;
- kaddr += exofs_last_byte(dir, n) - reclen;
- while ((char *) de <= kaddr) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: zero-length entry in "
- "directory(0x%lx)\n",
- dir->i_ino);
- exofs_put_page(page);
- goto out;
- }
- if (exofs_match(namelen, name, de))
- goto found;
- de = exofs_next_entry(de);
- }
- exofs_put_page(page);
- }
- if (++n >= npages)
- n = 0;
- } while (n != start);
-out:
- return NULL;
-
-found:
- *res_page = page;
- oi->i_dir_start_lookup = n;
- return de;
-}
-
-struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
-{
- struct page *page = exofs_get_page(dir, 0);
- struct exofs_dir_entry *de = NULL;
-
- if (!IS_ERR(page)) {
- de = exofs_next_entry(
- (struct exofs_dir_entry *)page_address(page));
- *p = page;
- }
- return de;
-}
-
-ino_t exofs_parent_ino(struct dentry *child)
-{
- struct page *page;
- struct exofs_dir_entry *de;
- ino_t ino;
-
- de = exofs_dotdot(d_inode(child), &page);
- if (!de)
- return 0;
-
- ino = le64_to_cpu(de->inode_no);
- exofs_put_page(page);
- return ino;
-}
-
-ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
-{
- ino_t res = 0;
- struct exofs_dir_entry *de;
- struct page *page;
-
- de = exofs_find_entry(dir, dentry, &page);
- if (de) {
- res = le64_to_cpu(de->inode_no);
- exofs_put_page(page);
- }
- return res;
-}
-
-int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
- struct page *page, struct inode *inode)
-{
- loff_t pos = page_offset(page) +
- (char *) de - (char *) page_address(page);
- unsigned len = le16_to_cpu(de->rec_len);
- int err;
-
- lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL);
- if (err)
- EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
- err);
-
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
- if (likely(!err))
- err = exofs_commit_chunk(page, pos, len);
- exofs_put_page(page);
- dir->i_mtime = dir->i_ctime = current_time(dir);
- mark_inode_dirty(dir);
- return err;
-}
-
-int exofs_add_link(struct dentry *dentry, struct inode *inode)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const unsigned char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- unsigned chunk_size = exofs_chunk_size(dir);
- unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
- unsigned short rec_len, name_len;
- struct page *page = NULL;
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- struct exofs_dir_entry *de;
- unsigned long npages = dir_pages(dir);
- unsigned long n;
- char *kaddr;
- loff_t pos;
- int err;
-
- for (n = 0; n <= npages; n++) {
- char *dir_end;
-
- page = exofs_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
- lock_page(page);
- kaddr = page_address(page);
- dir_end = kaddr + exofs_last_byte(dir, n);
- de = (struct exofs_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - reclen;
- while ((char *)de <= kaddr) {
- if ((char *)de == dir_end) {
- name_len = 0;
- rec_len = chunk_size;
- de->rec_len = cpu_to_le16(chunk_size);
- de->inode_no = 0;
- goto got_it;
- }
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_add_link: "
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- err = -EIO;
- goto out_unlock;
- }
- err = -EEXIST;
- if (exofs_match(namelen, name, de))
- goto out_unlock;
- name_len = EXOFS_DIR_REC_LEN(de->name_len);
- rec_len = le16_to_cpu(de->rec_len);
- if (!de->inode_no && rec_len >= reclen)
- goto got_it;
- if (rec_len >= name_len + reclen)
- goto got_it;
- de = (struct exofs_dir_entry *) ((char *) de + rec_len);
- }
- unlock_page(page);
- exofs_put_page(page);
- }
-
- EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
- dentry, inode->i_ino);
- return -EINVAL;
-
-got_it:
- pos = page_offset(page) +
- (char *)de - (char *)page_address(page);
- err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
- &page, NULL);
- if (err)
- goto out_unlock;
- if (de->inode_no) {
- struct exofs_dir_entry *de1 =
- (struct exofs_dir_entry *)((char *)de + name_len);
- de1->rec_len = cpu_to_le16(rec_len - name_len);
- de->rec_len = cpu_to_le16(name_len);
- de = de1;
- }
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
- err = exofs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = current_time(dir);
- mark_inode_dirty(dir);
- sbi->s_numfiles++;
-
-out_put:
- exofs_put_page(page);
-out:
- return err;
-out_unlock:
- unlock_page(page);
- goto out_put;
-}
-
-int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- char *kaddr = page_address(page);
- unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
- unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
- loff_t pos;
- struct exofs_dir_entry *pde = NULL;
- struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
- int err;
-
- while (de < dir) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_delete_entry:"
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- err = -EIO;
- goto out;
- }
- pde = de;
- de = exofs_next_entry(de);
- }
- if (pde)
- from = (char *)pde - (char *)page_address(page);
- pos = page_offset(page) + from;
- lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
- &page, NULL);
- if (err)
- EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
- err);
- if (pde)
- pde->rec_len = cpu_to_le16(to - from);
- dir->inode_no = 0;
- if (likely(!err))
- err = exofs_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = current_time(inode);
- mark_inode_dirty(inode);
- sbi->s_numfiles--;
-out:
- exofs_put_page(page);
- return err;
-}
-
-/* kept aligned on 4 bytes */
-#define THIS_DIR ".\0\0"
-#define PARENT_DIR "..\0"
-
-int exofs_make_empty(struct inode *inode, struct inode *parent)
-{
- struct address_space *mapping = inode->i_mapping;
- struct page *page = grab_cache_page(mapping, 0);
- unsigned chunk_size = exofs_chunk_size(inode);
- struct exofs_dir_entry *de;
- int err;
- void *kaddr;
-
- if (!page)
- return -ENOMEM;
-
- err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
- &page, NULL);
- if (err) {
- unlock_page(page);
- goto fail;
- }
-
- kaddr = kmap_atomic(page);
- de = (struct exofs_dir_entry *)kaddr;
- de->name_len = 1;
- de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
- memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
-
- de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
- de->name_len = 2;
- de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
- de->inode_no = cpu_to_le64(parent->i_ino);
- memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
- exofs_set_de_type(de, inode);
- kunmap_atomic(kaddr);
- err = exofs_commit_chunk(page, 0, chunk_size);
-fail:
- put_page(page);
- return err;
-}
-
-int exofs_empty_dir(struct inode *inode)
-{
- struct page *page = NULL;
- unsigned long i, npages = dir_pages(inode);
-
- for (i = 0; i < npages; i++) {
- char *kaddr;
- struct exofs_dir_entry *de;
- page = exofs_get_page(inode, i);
-
- if (IS_ERR(page))
- continue;
-
- kaddr = page_address(page);
- de = (struct exofs_dir_entry *)kaddr;
- kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
-
- while ((char *)de <= kaddr) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_empty_dir: "
- "zero-length directory entry"
- "kaddr=%p, de=%p\n", kaddr, de);
- goto not_empty;
- }
- if (de->inode_no != 0) {
- /* check for . and .. */
- if (de->name[0] != '.')
- goto not_empty;
- if (de->name_len > 2)
- goto not_empty;
- if (de->name_len < 2) {
- if (le64_to_cpu(de->inode_no) !=
- inode->i_ino)
- goto not_empty;
- } else if (de->name[1] != '.')
- goto not_empty;
- }
- de = exofs_next_entry(de);
- }
- exofs_put_page(page);
- }
- return 1;
-
-not_empty:
- exofs_put_page(page);
- return 0;
-}
-
-const struct file_operations exofs_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate_shared = exofs_readdir,
-};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
deleted file mode 100644
index 5dc392404559..000000000000
--- a/fs/exofs/exofs.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __EXOFS_H__
-#define __EXOFS_H__
-
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/backing-dev.h>
-#include <scsi/osd_ore.h>
-
-#include "common.h"
-
-#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define EXOFS_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define EXOFS_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-struct exofs_dev {
- struct ore_dev ored;
- unsigned did;
- unsigned urilen;
- uint8_t *uri;
- struct kobject ed_kobj;
-};
-/*
- * our extension to the in-memory superblock
- */
-struct exofs_sb_info {
- struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
- int s_timeout; /* timeout for OSD operations */
- uint64_t s_nextid; /* highest object ID used */
- uint32_t s_numfiles; /* number of files on fs */
- spinlock_t s_next_gen_lock; /* spinlock for gen # update */
- u32 s_next_generation; /* next gen # to use */
- atomic_t s_curr_pending; /* number of pending commands */
-
- struct ore_layout layout; /* Default files layout */
- struct ore_comp one_comp; /* id & cred of partition id=0*/
- struct ore_components oc; /* comps for the partition */
- struct kobject s_kobj; /* holds per-sbi kobject */
-};
-
-/*
- * our extension to the in-memory inode
- */
-struct exofs_i_info {
- struct inode vfs_inode; /* normal in-memory inode */
- wait_queue_head_t i_wq; /* wait queue for inode */
- unsigned long i_flags; /* various atomic flags */
- uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
- uint32_t i_dir_start_lookup; /* which page to start lookup */
- uint64_t i_commit_size; /* the object's written length */
- struct ore_comp one_comp; /* same component for all devices */
- struct ore_components oc; /* inode view of the device table */
-};
-
-static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
-{
- return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
-}
-
-/*
- * our inode flags
- */
-#define OBJ_2BCREATED 0 /* object will be created soon*/
-#define OBJ_CREATED 1 /* object has been created on the osd*/
-
-static inline int obj_2bcreated(struct exofs_i_info *oi)
-{
- return test_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline void set_obj_2bcreated(struct exofs_i_info *oi)
-{
- set_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline int obj_created(struct exofs_i_info *oi)
-{
- return test_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-static inline void set_obj_created(struct exofs_i_info *oi)
-{
- set_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi);
-static inline int wait_obj_created(struct exofs_i_info *oi)
-{
- if (likely(obj_created(oi)))
- return 0;
-
- return __exofs_wait_obj_created(oi);
-}
-
-/*
- * get to our inode from the vfs inode
- */
-static inline struct exofs_i_info *exofs_i(struct inode *inode)
-{
- return container_of(inode, struct exofs_i_info, vfs_inode);
-}
-
-/*
- * Maximum count of links to a file
- */
-#define EXOFS_LINK_MAX 32000
-
-/*************************
- * function declarations *
- *************************/
-
-/* inode.c */
-unsigned exofs_max_io_pages(struct ore_layout *layout,
- unsigned expected_pages);
-int exofs_setattr(struct dentry *, struct iattr *);
-int exofs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata);
-extern struct inode *exofs_iget(struct super_block *, unsigned long);
-struct inode *exofs_new_inode(struct inode *, umode_t);
-extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
-extern void exofs_evict_inode(struct inode *);
-
-/* dir.c: */
-int exofs_add_link(struct dentry *, struct inode *);
-ino_t exofs_inode_by_name(struct inode *, struct dentry *);
-int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
-int exofs_make_empty(struct inode *, struct inode *);
-struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
- struct page **);
-int exofs_empty_dir(struct inode *);
-struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
-ino_t exofs_parent_ino(struct dentry *child);
-int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
- struct inode *);
-
-/* super.c */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
- const struct osd_obj_id *obj);
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
-
-/* sys.c */
-int exofs_sysfs_init(void);
-void exofs_sysfs_uninit(void);
-int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
- struct exofs_dt_device_info *dt_dev);
-void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
-int exofs_sysfs_odev_add(struct exofs_dev *edev,
- struct exofs_sb_info *sbi);
-void exofs_sysfs_dbg_print(void);
-
-/*********************
- * operation vectors *
- *********************/
-/* dir.c: */
-extern const struct file_operations exofs_dir_operations;
-
-/* file.c */
-extern const struct inode_operations exofs_file_inode_operations;
-extern const struct file_operations exofs_file_operations;
-
-/* inode.c */
-extern const struct address_space_operations exofs_aops;
-
-/* namei.c */
-extern const struct inode_operations exofs_dir_inode_operations;
-extern const struct inode_operations exofs_special_inode_operations;
-
-/* exofs_init_comps will initialize an ore_components device array
- * pointing to a single ore_comp struct, and a round-robin view
- * of the device table.
- * The first device of each inode is the [inode->ino % num_devices]
- * and the rest of the devices sequentially following where the
- * first device is after the last device.
- * It is assumed that the global device array at @sbi is twice
- * bigger and that the device table repeats twice.
- * See: exofs_read_lookup_dev_table()
- */
-static inline void exofs_init_comps(struct ore_components *oc,
- struct ore_comp *one_comp,
- struct exofs_sb_info *sbi, osd_id oid)
-{
- unsigned dev_mod = (unsigned)oid, first_dev;
-
- one_comp->obj.partition = sbi->one_comp.obj.partition;
- one_comp->obj.id = oid;
- exofs_make_credential(one_comp->cred, &one_comp->obj);
-
- oc->first_dev = 0;
- oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
- sbi->layout.group_count;
- oc->single_comp = EC_SINGLE_COMP;
- oc->comps = one_comp;
-
- /* Round robin device view of the table */
- first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
- oc->ods = &sbi->oc.ods[first_dev];
-}
-
-#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
deleted file mode 100644
index a94594ea2aa3..000000000000
--- a/fs/exofs/file.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "exofs.h"
-
-static int exofs_release_file(struct inode *inode, struct file *filp)
-{
- return 0;
-}
-
-/* exofs_file_fsync - flush the inode to disk
- *
- * Note, in exofs all metadata is written as part of inode, regardless.
- * The writeout is synchronous
- */
-static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = filp->f_mapping->host;
- int ret;
-
- ret = file_write_and_wait_range(filp, start, end);
- if (ret)
- return ret;
-
- inode_lock(inode);
- ret = sync_inode_metadata(filp->f_mapping->host, 1);
- inode_unlock(inode);
- return ret;
-}
-
-static int exofs_flush(struct file *file, fl_owner_t id)
-{
- int ret = vfs_fsync(file, 0);
- /* TODO: Flush the OSD target */
- return ret;
-}
-
-const struct file_operations exofs_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .open = generic_file_open,
- .release = exofs_release_file,
- .fsync = exofs_file_fsync,
- .flush = exofs_flush,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
-};
-
-const struct inode_operations exofs_file_inode_operations = {
- .setattr = exofs_setattr,
-};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
deleted file mode 100644
index 5f81fcd383a4..000000000000
--- a/fs/exofs/inode.c
+++ /dev/null
@@ -1,1514 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/slab.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-unsigned exofs_max_io_pages(struct ore_layout *layout,
- unsigned expected_pages)
-{
- unsigned pages = min_t(unsigned, expected_pages,
- layout->max_io_length / PAGE_SIZE);
-
- return pages;
-}
-
-struct page_collect {
- struct exofs_sb_info *sbi;
- struct inode *inode;
- unsigned expected_pages;
- struct ore_io_state *ios;
-
- struct page **pages;
- unsigned alloc_pages;
- unsigned nr_pages;
- unsigned long length;
- loff_t pg_first; /* keep 64bit also in 32-arches */
- bool read_4_write; /* This means two things: that the read is sync
- * And the pages should not be unlocked.
- */
- struct page *that_locked_page;
-};
-
-static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
- struct inode *inode)
-{
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-
- pcol->sbi = sbi;
- pcol->inode = inode;
- pcol->expected_pages = expected_pages;
-
- pcol->ios = NULL;
- pcol->pages = NULL;
- pcol->alloc_pages = 0;
- pcol->nr_pages = 0;
- pcol->length = 0;
- pcol->pg_first = -1;
- pcol->read_4_write = false;
- pcol->that_locked_page = NULL;
-}
-
-static void _pcol_reset(struct page_collect *pcol)
-{
- pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
-
- pcol->pages = NULL;
- pcol->alloc_pages = 0;
- pcol->nr_pages = 0;
- pcol->length = 0;
- pcol->pg_first = -1;
- pcol->ios = NULL;
- pcol->that_locked_page = NULL;
-
- /* this is probably the end of the loop but in writes
- * it might not end here. don't be left with nothing
- */
- if (!pcol->expected_pages)
- pcol->expected_pages =
- exofs_max_io_pages(&pcol->sbi->layout, ~0);
-}
-
-static int pcol_try_alloc(struct page_collect *pcol)
-{
- unsigned pages;
-
- /* TODO: easily support bio chaining */
- pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
-
- for (; pages; pages >>= 1) {
- pcol->pages = kmalloc_array(pages, sizeof(struct page *),
- GFP_KERNEL);
- if (likely(pcol->pages)) {
- pcol->alloc_pages = pages;
- return 0;
- }
- }
-
- EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
- pcol->expected_pages);
- return -ENOMEM;
-}
-
-static void pcol_free(struct page_collect *pcol)
-{
- kfree(pcol->pages);
- pcol->pages = NULL;
-
- if (pcol->ios) {
- ore_put_io_state(pcol->ios);
- pcol->ios = NULL;
- }
-}
-
-static int pcol_add_page(struct page_collect *pcol, struct page *page,
- unsigned len)
-{
- if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
- return -ENOMEM;
-
- pcol->pages[pcol->nr_pages++] = page;
- pcol->length += len;
- return 0;
-}
-
-enum {PAGE_WAS_NOT_IN_IO = 17};
-static int update_read_page(struct page *page, int ret)
-{
- switch (ret) {
- case 0:
- /* Everything is OK */
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- break;
- case -EFAULT:
- /* In this case we were trying to read something that wasn't on
- * disk yet - return a page full of zeroes. This should be OK,
- * because the object should be empty (if there was a write
- * before this read, the read would be waiting with the page
- * locked */
- clear_highpage(page);
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- EXOFS_DBGMSG("recovered read error\n");
- /* fall through */
- case PAGE_WAS_NOT_IN_IO:
- ret = 0; /* recovered error */
- break;
- default:
- SetPageError(page);
- }
- return ret;
-}
-
-static void update_write_page(struct page *page, int ret)
-{
- if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
- return; /* don't pass start don't collect $200 */
-
- if (ret) {
- mapping_set_error(page->mapping, ret);
- SetPageError(page);
- }
- end_page_writeback(page);
-}
-
-/* Called at the end of reads, to optionally unlock pages and update their
- * status.
- */
-static int __readpages_done(struct page_collect *pcol)
-{
- int i;
- u64 good_bytes;
- u64 length = 0;
- int ret = ore_check_io(pcol->ios, NULL);
-
- if (likely(!ret)) {
- good_bytes = pcol->length;
- ret = PAGE_WAS_NOT_IN_IO;
- } else {
- good_bytes = 0;
- }
-
- EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
- " length=0x%lx nr_pages=%u\n",
- pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
- pcol->nr_pages);
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
- struct inode *inode = page->mapping->host;
- int page_stat;
-
- if (inode != pcol->inode)
- continue; /* osd might add more pages at end */
-
- if (likely(length < good_bytes))
- page_stat = 0;
- else
- page_stat = ret;
-
- EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
- inode->i_ino, page->index,
- page_stat ? "bad_bytes" : "good_bytes");
-
- ret = update_read_page(page, page_stat);
- if (!pcol->read_4_write)
- unlock_page(page);
- length += PAGE_SIZE;
- }
-
- pcol_free(pcol);
- EXOFS_DBGMSG2("readpages_done END\n");
- return ret;
-}
-
-/* callback of async reads */
-static void readpages_done(struct ore_io_state *ios, void *p)
-{
- struct page_collect *pcol = p;
-
- __readpages_done(pcol);
- atomic_dec(&pcol->sbi->s_curr_pending);
- kfree(pcol);
-}
-
-static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
-{
- int i;
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
-
- if (rw == READ)
- update_read_page(page, ret);
- else
- update_write_page(page, ret);
-
- unlock_page(page);
- }
-}
-
-static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
- struct page_collect *pcol_src, struct page_collect *pcol)
-{
- /* length was wrong or offset was not page aligned */
- BUG_ON(pcol_src->nr_pages < ios->nr_pages);
-
- if (pcol_src->nr_pages > ios->nr_pages) {
- struct page **src_page;
- unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
- unsigned long len_less = pcol_src->length - ios->length;
- unsigned i;
- int ret;
-
- /* This IO was trimmed */
- pcol_src->nr_pages = ios->nr_pages;
- pcol_src->length = ios->length;
-
- /* Left over pages are passed to the next io */
- pcol->expected_pages += pages_less;
- pcol->nr_pages = pages_less;
- pcol->length = len_less;
- src_page = pcol_src->pages + pcol_src->nr_pages;
- pcol->pg_first = (*src_page)->index;
-
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < pages_less; ++i)
- pcol->pages[i] = *src_page++;
-
- EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
- "pages_less=0x%x expected_pages=0x%x "
- "next_offset=0x%llx next_len=0x%lx\n",
- pcol_src->nr_pages, pages_less, pcol->expected_pages,
- pcol->pg_first * PAGE_SIZE, pcol->length);
- }
- return 0;
-}
-
-static int read_exec(struct page_collect *pcol)
-{
- struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct ore_io_state *ios;
- struct page_collect *pcol_copy = NULL;
- int ret;
-
- if (!pcol->pages)
- return 0;
-
- if (!pcol->ios) {
- int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
- pcol->pg_first << PAGE_SHIFT,
- pcol->length, &pcol->ios);
-
- if (ret)
- return ret;
- }
-
- ios = pcol->ios;
- ios->pages = pcol->pages;
-
- if (pcol->read_4_write) {
- ore_read(pcol->ios);
- return __readpages_done(pcol);
- }
-
- pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
- if (!pcol_copy) {
- ret = -ENOMEM;
- goto err;
- }
-
- *pcol_copy = *pcol;
- ios->done = readpages_done;
- ios->private = pcol_copy;
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
-
- ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
- if (unlikely(ret))
- goto err;
-
- EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
- pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
- ret = ore_read(ios);
- if (unlikely(ret))
- goto err;
-
- atomic_inc(&pcol->sbi->s_curr_pending);
-
- return 0;
-
-err:
- if (!pcol_copy) /* Failed before ownership transfer */
- pcol_copy = pcol;
- _unlock_pcol_pages(pcol_copy, ret, READ);
- pcol_free(pcol_copy);
- kfree(pcol_copy);
-
- return ret;
-}
-
-/* readpage_strip is called either directly from readpage() or by the VFS from
- * within read_cache_pages(), to add one more page to be read. It will try to
- * collect as many contiguous pages as posible. If a discontinuity is
- * encountered, or it runs out of resources, it will submit the previous segment
- * and will start a new collection. Eventually caller must submit the last
- * segment if present.
- */
-static int readpage_strip(void *data, struct page *page)
-{
- struct page_collect *pcol = data;
- struct inode *inode = pcol->inode;
- struct exofs_i_info *oi = exofs_i(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- size_t len;
- int ret;
-
- BUG_ON(!PageLocked(page));
-
- /* FIXME: Just for debugging, will be removed */
- if (PageUptodate(page))
- EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
- page->index);
-
- pcol->that_locked_page = page;
-
- if (page->index < end_index)
- len = PAGE_SIZE;
- else if (page->index == end_index)
- len = i_size & ~PAGE_MASK;
- else
- len = 0;
-
- if (!len || !obj_created(oi)) {
- /* this will be out of bounds, or doesn't exist yet.
- * Current page is cleared and the request is split
- */
- clear_highpage(page);
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
-
- if (!pcol->read_4_write)
- unlock_page(page);
- EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
- "read_4_write=%d index=0x%lx end_index=0x%lx "
- "splitting\n", inode->i_ino, len,
- pcol->read_4_write, page->index, end_index);
-
- return read_exec(pcol);
- }
-
-try_again:
-
- if (unlikely(pcol->pg_first == -1)) {
- pcol->pg_first = page->index;
- } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
- page->index)) {
- /* Discontinuity detected, split the request */
- ret = read_exec(pcol);
- if (unlikely(ret))
- goto fail;
- goto try_again;
- }
-
- if (!pcol->pages) {
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- goto fail;
- }
-
- if (len != PAGE_SIZE)
- zero_user(page, len, PAGE_SIZE - len);
-
- EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
- inode->i_ino, page->index, len);
-
- ret = pcol_add_page(pcol, page, len);
- if (ret) {
- EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
- "this_len=0x%zx nr_pages=%u length=0x%lx\n",
- page, len, pcol->nr_pages, pcol->length);
-
- /* split the request, and start again with current page */
- ret = read_exec(pcol);
- if (unlikely(ret))
- goto fail;
-
- goto try_again;
- }
-
- return 0;
-
-fail:
- /* SetPageError(page); ??? */
- unlock_page(page);
- return ret;
-}
-
-static int exofs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, nr_pages, mapping->host);
-
- ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
- if (ret) {
- EXOFS_ERR("read_cache_pages => %d\n", ret);
- return ret;
- }
-
- ret = read_exec(&pcol);
- if (unlikely(ret))
- return ret;
-
- return read_exec(&pcol);
-}
-
-static int _readpage(struct page *page, bool read_4_write)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, 1, page->mapping->host);
-
- pcol.read_4_write = read_4_write;
- ret = readpage_strip(&pcol, page);
- if (ret) {
- EXOFS_ERR("_readpage => %d\n", ret);
- return ret;
- }
-
- return read_exec(&pcol);
-}
-
-/*
- * We don't need the file
- */
-static int exofs_readpage(struct file *file, struct page *page)
-{
- return _readpage(page, false);
-}
-
-/* Callback for osd_write. All writes are asynchronous */
-static void writepages_done(struct ore_io_state *ios, void *p)
-{
- struct page_collect *pcol = p;
- int i;
- u64 good_bytes;
- u64 length = 0;
- int ret = ore_check_io(ios, NULL);
-
- atomic_dec(&pcol->sbi->s_curr_pending);
-
- if (likely(!ret)) {
- good_bytes = pcol->length;
- ret = PAGE_WAS_NOT_IN_IO;
- } else {
- good_bytes = 0;
- }
-
- EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
- " length=0x%lx nr_pages=%u\n",
- pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
- pcol->nr_pages);
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
- struct inode *inode = page->mapping->host;
- int page_stat;
-
- if (inode != pcol->inode)
- continue; /* osd might add more pages to a bio */
-
- if (likely(length < good_bytes))
- page_stat = 0;
- else
- page_stat = ret;
-
- update_write_page(page, page_stat);
- unlock_page(page);
- EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
- inode->i_ino, page->index, page_stat);
-
- length += PAGE_SIZE;
- }
-
- pcol_free(pcol);
- kfree(pcol);
- EXOFS_DBGMSG2("writepages_done END\n");
-}
-
-static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
-{
- struct page_collect *pcol = priv;
- pgoff_t index = offset / PAGE_SIZE;
-
- if (!pcol->that_locked_page ||
- (pcol->that_locked_page->index != index)) {
- struct page *page;
- loff_t i_size = i_size_read(pcol->inode);
-
- if (offset >= i_size) {
- *uptodate = true;
- EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
- return ZERO_PAGE(0);
- }
-
- page = find_get_page(pcol->inode->i_mapping, index);
- if (!page) {
- page = find_or_create_page(pcol->inode->i_mapping,
- index, GFP_NOFS);
- if (unlikely(!page)) {
- EXOFS_DBGMSG("grab_cache_page Failed "
- "index=0x%llx\n", _LLU(index));
- return NULL;
- }
- unlock_page(page);
- }
- *uptodate = PageUptodate(page);
- EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
- return page;
- } else {
- EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
- pcol->that_locked_page->index);
- *uptodate = true;
- return pcol->that_locked_page;
- }
-}
-
-static void __r4w_put_page(void *priv, struct page *page)
-{
- struct page_collect *pcol = priv;
-
- if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
- EXOFS_DBGMSG2("index=0x%lx\n", page->index);
- put_page(page);
- return;
- }
- EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
- ZERO_PAGE(0) == page ? -1 : page->index);
-}
-
-static const struct _ore_r4w_op _r4w_op = {
- .get_page = &__r4w_get_page,
- .put_page = &__r4w_put_page,
-};
-
-static int write_exec(struct page_collect *pcol)
-{
- struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct ore_io_state *ios;
- struct page_collect *pcol_copy = NULL;
- int ret;
-
- if (!pcol->pages)
- return 0;
-
- BUG_ON(pcol->ios);
- ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
- pcol->pg_first << PAGE_SHIFT,
- pcol->length, &pcol->ios);
- if (unlikely(ret))
- goto err;
-
- pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
- if (!pcol_copy) {
- EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
- ret = -ENOMEM;
- goto err;
- }
-
- *pcol_copy = *pcol;
-
- ios = pcol->ios;
- ios->pages = pcol_copy->pages;
- ios->done = writepages_done;
- ios->r4w = &_r4w_op;
- ios->private = pcol_copy;
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
-
- ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
- if (unlikely(ret))
- goto err;
-
- EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
- pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
- ret = ore_write(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("write_exec: ore_write() Failed\n");
- goto err;
- }
-
- atomic_inc(&pcol->sbi->s_curr_pending);
- return 0;
-
-err:
- if (!pcol_copy) /* Failed before ownership transfer */
- pcol_copy = pcol;
- _unlock_pcol_pages(pcol_copy, ret, WRITE);
- pcol_free(pcol_copy);
- kfree(pcol_copy);
-
- return ret;
-}
-
-/* writepage_strip is called either directly from writepage() or by the VFS from
- * within write_cache_pages(), to add one more page to be written to storage.
- * It will try to collect as many contiguous pages as possible. If a
- * discontinuity is encountered or it runs out of resources it will submit the
- * previous segment and will start a new collection.
- * Eventually caller must submit the last segment if present.
- */
-static int writepage_strip(struct page *page,
- struct writeback_control *wbc_unused, void *data)
-{
- struct page_collect *pcol = data;
- struct inode *inode = pcol->inode;
- struct exofs_i_info *oi = exofs_i(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- size_t len;
- int ret;
-
- BUG_ON(!PageLocked(page));
-
- ret = wait_obj_created(oi);
- if (unlikely(ret))
- goto fail;
-
- if (page->index < end_index)
- /* in this case, the page is within the limits of the file */
- len = PAGE_SIZE;
- else {
- len = i_size & ~PAGE_MASK;
-
- if (page->index > end_index || !len) {
- /* in this case, the page is outside the limits
- * (truncate in progress)
- */
- ret = write_exec(pcol);
- if (unlikely(ret))
- goto fail;
- if (PageError(page))
- ClearPageError(page);
- unlock_page(page);
- EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
- "outside the limits\n",
- inode->i_ino, page->index);
- return 0;
- }
- }
-
-try_again:
-
- if (unlikely(pcol->pg_first == -1)) {
- pcol->pg_first = page->index;
- } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
- page->index)) {
- /* Discontinuity detected, split the request */
- ret = write_exec(pcol);
- if (unlikely(ret))
- goto fail;
-
- EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
- inode->i_ino, page->index);
- goto try_again;
- }
-
- if (!pcol->pages) {
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- goto fail;
- }
-
- EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
- inode->i_ino, page->index, len);
-
- ret = pcol_add_page(pcol, page, len);
- if (unlikely(ret)) {
- EXOFS_DBGMSG2("Failed pcol_add_page "
- "nr_pages=%u total_length=0x%lx\n",
- pcol->nr_pages, pcol->length);
-
- /* split the request, next loop will start again */
- ret = write_exec(pcol);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("write_exec failed => %d", ret);
- goto fail;
- }
-
- goto try_again;
- }
-
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
-
- return 0;
-
-fail:
- EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
- inode->i_ino, page->index, ret);
- mapping_set_error(page->mapping, -EIO);
- unlock_page(page);
- return ret;
-}
-
-static int exofs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct page_collect pcol;
- long start, end, expected_pages;
- int ret;
-
- start = wbc->range_start >> PAGE_SHIFT;
- end = (wbc->range_end == LLONG_MAX) ?
- start + mapping->nrpages :
- wbc->range_end >> PAGE_SHIFT;
-
- if (start || end)
- expected_pages = end - start + 1;
- else
- expected_pages = mapping->nrpages;
-
- if (expected_pages < 32L)
- expected_pages = 32L;
-
- EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
- "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
- mapping->host->i_ino, wbc->range_start, wbc->range_end,
- mapping->nrpages, start, end, expected_pages);
-
- _pcol_init(&pcol, expected_pages, mapping->host);
-
- ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
- if (unlikely(ret)) {
- EXOFS_ERR("write_cache_pages => %d\n", ret);
- return ret;
- }
-
- ret = write_exec(&pcol);
- if (unlikely(ret))
- return ret;
-
- if (wbc->sync_mode == WB_SYNC_ALL) {
- return write_exec(&pcol); /* pump the last reminder */
- } else if (pcol.nr_pages) {
- /* not SYNC let the reminder join the next writeout */
- unsigned i;
-
- for (i = 0; i < pcol.nr_pages; i++) {
- struct page *page = pcol.pages[i];
-
- end_page_writeback(page);
- set_page_dirty(page);
- unlock_page(page);
- }
- }
- return 0;
-}
-
-/*
-static int exofs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, 1, page->mapping->host);
-
- ret = writepage_strip(page, NULL, &pcol);
- if (ret) {
- EXOFS_ERR("exofs_writepage => %d\n", ret);
- return ret;
- }
-
- return write_exec(&pcol);
-}
-*/
-/* i_mutex held using inode->i_size directly */
-static void _write_failed(struct inode *inode, loff_t to)
-{
- if (to > inode->i_size)
- truncate_pagecache(inode, inode->i_size);
-}
-
-int exofs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- int ret = 0;
- struct page *page;
-
- page = *pagep;
- if (page == NULL) {
- page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT,
- flags);
- if (!page) {
- EXOFS_DBGMSG("grab_cache_page_write_begin failed\n");
- return -ENOMEM;
- }
- *pagep = page;
- }
-
- /* read modify write */
- if (!PageUptodate(page) && (len != PAGE_SIZE)) {
- loff_t i_size = i_size_read(mapping->host);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
-
- if (page->index > end_index) {
- clear_highpage(page);
- SetPageUptodate(page);
- } else {
- ret = _readpage(page, true);
- if (ret) {
- unlock_page(page);
- EXOFS_DBGMSG("__readpage failed\n");
- }
- }
- }
- return ret;
-}
-
-static int exofs_write_begin_export(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- *pagep = NULL;
-
- return exofs_write_begin(file, mapping, pos, len, flags, pagep,
- fsdata);
-}
-
-static int exofs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = mapping->host;
- loff_t last_pos = pos + copied;
-
- if (!PageUptodate(page)) {
- if (copied < len) {
- _write_failed(inode, pos + len);
- copied = 0;
- goto out;
- }
- SetPageUptodate(page);
- }
- if (last_pos > inode->i_size) {
- i_size_write(inode, last_pos);
- mark_inode_dirty(inode);
- }
- set_page_dirty(page);
-out:
- unlock_page(page);
- put_page(page);
- return copied;
-}
-
-static int exofs_releasepage(struct page *page, gfp_t gfp)
-{
- EXOFS_DBGMSG("page 0x%lx\n", page->index);
- WARN_ON(1);
- return 0;
-}
-
-static void exofs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
- page->index, offset, length);
- WARN_ON(1);
-}
-
-
- /* TODO: Should be easy enough to do proprly */
-static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- return 0;
-}
-
-const struct address_space_operations exofs_aops = {
- .readpage = exofs_readpage,
- .readpages = exofs_readpages,
- .writepage = NULL,
- .writepages = exofs_writepages,
- .write_begin = exofs_write_begin_export,
- .write_end = exofs_write_end,
- .releasepage = exofs_releasepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
- .invalidatepage = exofs_invalidatepage,
-
- /* Not implemented Yet */
- .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
- .direct_IO = exofs_direct_IO,
-
- /* With these NULL has special meaning or default is not exported */
- .migratepage = NULL,
- .launder_page = NULL,
- .is_partially_uptodate = NULL,
- .error_remove_page = NULL,
-};
-
-/******************************************************************************
- * INODE OPERATIONS
- *****************************************************************************/
-
-/*
- * Test whether an inode is a fast symlink.
- */
-static inline int exofs_inode_is_fast_symlink(struct inode *inode)
-{
- struct exofs_i_info *oi = exofs_i(inode);
-
- return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
-}
-
-static int _do_truncate(struct inode *inode, loff_t newsize)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- int ret;
-
- inode->i_mtime = inode->i_ctime = current_time(inode);
-
- ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
- if (likely(!ret))
- truncate_setsize(inode, newsize);
-
- EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
- inode->i_ino, newsize, ret);
- return ret;
-}
-
-/*
- * Set inode attributes - update size attribute on OSD if needed,
- * otherwise just call generic functions.
- */
-int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = d_inode(dentry);
- int error;
-
- /* if we are about to modify an object, and it hasn't been
- * created yet, wait
- */
- error = wait_obj_created(exofs_i(inode));
- if (unlikely(error))
- return error;
-
- error = setattr_prepare(dentry, iattr);
- if (unlikely(error))
- return error;
-
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
- error = _do_truncate(inode, iattr->ia_size);
- if (unlikely(error))
- return error;
- }
-
- setattr_copy(inode, iattr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_FILE_LAYOUT,
- 0);
-static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_DIR_LAYOUT,
- 0);
-
-/*
- * Read the Linux inode info from the OSD, and return it as is. In exofs the
- * inode info is in an application specific page/attribute of the osd-object.
- */
-static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
- struct exofs_fcb *inode)
-{
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct osd_attr attrs[] = {
- [0] = g_attr_inode_data,
- [1] = g_attr_inode_file_layout,
- [2] = g_attr_inode_dir_layout,
- };
- struct ore_io_state *ios;
- struct exofs_on_disk_inode_layout *layout;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
- attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
- _LLU(oi->one_comp.obj.id), ret);
- memset(inode, 0, sizeof(*inode));
- inode->i_mode = 0040000 | (0777 & ~022);
- /* If object is lost on target we might as well enable it's
- * delete.
- */
- ret = 0;
- goto out;
- }
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
- goto out;
- }
- WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
- memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
-
- ret = extract_attr_from_ios(ios, &attrs[1]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
- goto out;
- }
- if (attrs[1].len) {
- layout = attrs[1].val_ptr;
- if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
- EXOFS_ERR("%s: unsupported files layout %d\n",
- __func__, layout->gen_func);
- ret = -ENOTSUPP;
- goto out;
- }
- }
-
- ret = extract_attr_from_ios(ios, &attrs[2]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
- goto out;
- }
- if (attrs[2].len) {
- layout = attrs[2].val_ptr;
- if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
- EXOFS_ERR("%s: unsupported meta-data layout %d\n",
- __func__, layout->gen_func);
- ret = -ENOTSUPP;
- goto out;
- }
- }
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static void __oi_init(struct exofs_i_info *oi)
-{
- init_waitqueue_head(&oi->i_wq);
- oi->i_flags = 0;
-}
-/*
- * Fill in an inode read from the OSD and set it up for use
- */
-struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
-{
- struct exofs_i_info *oi;
- struct exofs_fcb fcb;
- struct inode *inode;
- int ret;
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
- oi = exofs_i(inode);
- __oi_init(oi);
- exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
- exofs_oi_objno(oi));
-
- /* read the inode from the osd */
- ret = exofs_get_inode(sb, oi, &fcb);
- if (ret)
- goto bad_inode;
-
- set_obj_created(oi);
-
- /* copy stuff from on-disk struct to in-memory struct */
- inode->i_mode = le16_to_cpu(fcb.i_mode);
- i_uid_write(inode, le32_to_cpu(fcb.i_uid));
- i_gid_write(inode, le32_to_cpu(fcb.i_gid));
- set_nlink(inode, le16_to_cpu(fcb.i_links_count));
- inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
- inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
- inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
- inode->i_ctime.tv_nsec =
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
- oi->i_commit_size = le64_to_cpu(fcb.i_size);
- i_size_write(inode, oi->i_commit_size);
- inode->i_blkbits = EXOFS_BLKSHIFT;
- inode->i_generation = le32_to_cpu(fcb.i_generation);
-
- oi->i_dir_start_lookup = 0;
-
- if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
- ret = -ESTALE;
- goto bad_inode;
- }
-
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (fcb.i_data[0])
- inode->i_rdev =
- old_decode_dev(le32_to_cpu(fcb.i_data[0]));
- else
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(fcb.i_data[1]));
- } else {
- memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
- }
-
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &exofs_file_inode_operations;
- inode->i_fop = &exofs_file_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &exofs_dir_inode_operations;
- inode->i_fop = &exofs_dir_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- } else if (S_ISLNK(inode->i_mode)) {
- if (exofs_inode_is_fast_symlink(inode)) {
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)oi->i_data;
- } else {
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &exofs_aops;
- }
- } else {
- inode->i_op = &exofs_special_inode_operations;
- if (fcb.i_data[0])
- init_special_inode(inode, inode->i_mode,
- old_decode_dev(le32_to_cpu(fcb.i_data[0])));
- else
- init_special_inode(inode, inode->i_mode,
- new_decode_dev(le32_to_cpu(fcb.i_data[1])));
- }
-
- unlock_new_inode(inode);
- return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(ret);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi)
-{
- if (!obj_created(oi)) {
- EXOFS_DBGMSG("!obj_created\n");
- BUG_ON(!obj_2bcreated(oi));
- wait_event(oi->i_wq, obj_created(oi));
- EXOFS_DBGMSG("wait_event done\n");
- }
- return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
-}
-
-/*
- * Callback function from exofs_new_inode(). The important thing is that we
- * set the obj_created flag so that other methods know that the object exists on
- * the OSD.
- */
-static void create_done(struct ore_io_state *ios, void *p)
-{
- struct inode *inode = p;
- struct exofs_i_info *oi = exofs_i(inode);
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- int ret;
-
- ret = ore_check_io(ios, NULL);
- ore_put_io_state(ios);
-
- atomic_dec(&sbi->s_curr_pending);
-
- if (unlikely(ret)) {
- EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
- _LLU(exofs_oi_objno(oi)),
- _LLU(oi->one_comp.obj.partition));
- /*TODO: When FS is corrupted creation can fail, object already
- * exist. Get rid of this asynchronous creation, if exist
- * increment the obj counter and try the next object. Until we
- * succeed. All these dangling objects will be made into lost
- * files by chkfs.exofs
- */
- }
-
- set_obj_created(oi);
-
- wake_up(&oi->i_wq);
-}
-
-/*
- * Set up a new inode and create an object for it on the OSD
- */
-struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
-{
- struct super_block *sb = dir->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct inode *inode;
- struct exofs_i_info *oi;
- struct ore_io_state *ios;
- int ret;
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- oi = exofs_i(inode);
- __oi_init(oi);
-
- set_obj_2bcreated(oi);
-
- inode_init_owner(inode, dir, mode);
- inode->i_ino = sbi->s_nextid++;
- inode->i_blkbits = EXOFS_BLKSHIFT;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- oi->i_commit_size = inode->i_size = 0;
- spin_lock(&sbi->s_next_gen_lock);
- inode->i_generation = sbi->s_next_generation++;
- spin_unlock(&sbi->s_next_gen_lock);
- insert_inode_hash(inode);
-
- exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
- exofs_oi_objno(oi));
- exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
-
- mark_inode_dirty(inode);
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
- return ERR_PTR(ret);
- }
-
- ios->done = create_done;
- ios->private = inode;
-
- ret = ore_create(ios);
- if (ret) {
- ore_put_io_state(ios);
- return ERR_PTR(ret);
- }
- atomic_inc(&sbi->s_curr_pending);
-
- return inode;
-}
-
-/*
- * struct to pass two arguments to update_inode's callback
- */
-struct updatei_args {
- struct exofs_sb_info *sbi;
- struct exofs_fcb fcb;
-};
-
-/*
- * Callback function from exofs_update_inode().
- */
-static void updatei_done(struct ore_io_state *ios, void *p)
-{
- struct updatei_args *args = p;
-
- ore_put_io_state(ios);
-
- atomic_dec(&args->sbi->s_curr_pending);
-
- kfree(args);
-}
-
-/*
- * Write the inode to the OSD. Just fill up the struct, and set the attribute
- * synchronously or asynchronously depending on the do_sync flag.
- */
-static int exofs_update_inode(struct inode *inode, int do_sync)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct super_block *sb = inode->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- struct osd_attr attr;
- struct exofs_fcb *fcb;
- struct updatei_args *args;
- int ret;
-
- args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args) {
- EXOFS_DBGMSG("Failed kzalloc of args\n");
- return -ENOMEM;
- }
-
- fcb = &args->fcb;
-
- fcb->i_mode = cpu_to_le16(inode->i_mode);
- fcb->i_uid = cpu_to_le32(i_uid_read(inode));
- fcb->i_gid = cpu_to_le32(i_gid_read(inode));
- fcb->i_links_count = cpu_to_le16(inode->i_nlink);
- fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- oi->i_commit_size = i_size_read(inode);
- fcb->i_size = cpu_to_le64(oi->i_commit_size);
- fcb->i_generation = cpu_to_le32(inode->i_generation);
-
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- fcb->i_data[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- fcb->i_data[1] = 0;
- } else {
- fcb->i_data[0] = 0;
- fcb->i_data[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- fcb->i_data[2] = 0;
- }
- } else
- memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- goto free_args;
- }
-
- attr = g_attr_inode_data;
- attr.val_ptr = fcb;
- ios->out_attr_len = 1;
- ios->out_attr = &attr;
-
- wait_obj_created(oi);
-
- if (!do_sync) {
- args->sbi = sbi;
- ios->done = updatei_done;
- ios->private = args;
- }
-
- ret = ore_write(ios);
- if (!do_sync && !ret) {
- atomic_inc(&sbi->s_curr_pending);
- goto out; /* deallocation in updatei_done */
- }
-
- ore_put_io_state(ios);
-free_args:
- kfree(args);
-out:
- EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
- inode->i_ino, do_sync, ret);
- return ret;
-}
-
-int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
- return exofs_update_inode(inode, 1);
-}
-
-/*
- * Callback function from exofs_delete_inode() - don't have much cleaning up to
- * do.
- */
-static void delete_done(struct ore_io_state *ios, void *p)
-{
- struct exofs_sb_info *sbi = p;
-
- ore_put_io_state(ios);
-
- atomic_dec(&sbi->s_curr_pending);
-}
-
-/*
- * Called when the refcount of an inode reaches zero. We remove the object
- * from the OSD here. We make sure the object was created before we try and
- * delete it.
- */
-void exofs_evict_inode(struct inode *inode)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct super_block *sb = inode->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- int ret;
-
- truncate_inode_pages_final(&inode->i_data);
-
- /* TODO: should do better here */
- if (inode->i_nlink || is_bad_inode(inode))
- goto no_delete;
-
- inode->i_size = 0;
- clear_inode(inode);
-
- /* if we are deleting an obj that hasn't been created yet, wait.
- * This also makes sure that create_done cannot be called with an
- * already evicted inode.
- */
- wait_obj_created(oi);
- /* ignore the error, attempt a remove anyway */
-
- /* Now Remove the OSD objects */
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
- return;
- }
-
- ios->done = delete_done;
- ios->private = sbi;
-
- ret = ore_remove(ios);
- if (ret) {
- EXOFS_ERR("%s: ore_remove failed\n", __func__);
- ore_put_io_state(ios);
- return;
- }
- atomic_inc(&sbi->s_curr_pending);
-
- return;
-
-no_delete:
- clear_inode(inode);
-}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
deleted file mode 100644
index 7295cd722770..000000000000
--- a/fs/exofs/namei.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "exofs.h"
-
-static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
-{
- int err = exofs_add_link(dentry, inode);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
- inode_dec_link_count(inode);
- iput(inode);
- return err;
-}
-
-static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct inode *inode;
- ino_t ino;
-
- if (dentry->d_name.len > EXOFS_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
- ino = exofs_inode_by_name(dir, dentry);
- inode = ino ? exofs_iget(dir->i_sb, ino) : NULL;
- return d_splice_alias(inode, dentry);
-}
-
-static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
-{
- struct inode *inode = exofs_new_inode(dir, mode);
- int err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &exofs_file_inode_operations;
- inode->i_fop = &exofs_file_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- mark_inode_dirty(inode);
- err = exofs_add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
- dev_t rdev)
-{
- struct inode *inode;
- int err;
-
- inode = exofs_new_inode(dir, mode);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- init_special_inode(inode, inode->i_mode, rdev);
- mark_inode_dirty(inode);
- err = exofs_add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int exofs_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
-{
- struct super_block *sb = dir->i_sb;
- int err = -ENAMETOOLONG;
- unsigned l = strlen(symname)+1;
- struct inode *inode;
- struct exofs_i_info *oi;
-
- if (l > sb->s_blocksize)
- goto out;
-
- inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out;
-
- oi = exofs_i(inode);
- if (l > sizeof(oi->i_data)) {
- /* slow symlink */
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &exofs_aops;
- memset(oi->i_data, 0, sizeof(oi->i_data));
-
- err = page_symlink(inode, symname, l);
- if (err)
- goto out_fail;
- } else {
- /* fast symlink */
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)oi->i_data;
- memcpy(oi->i_data, symname, l);
- inode->i_size = l-1;
- }
- mark_inode_dirty(inode);
-
- err = exofs_add_nondir(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- iput(inode);
- goto out;
-}
-
-static int exofs_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *inode = d_inode(old_dentry);
-
- inode->i_ctime = current_time(inode);
- inode_inc_link_count(inode);
- ihold(inode);
-
- return exofs_add_nondir(dentry, inode);
-}
-
-static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct inode *inode;
- int err;
-
- inode_inc_link_count(dir);
-
- inode = exofs_new_inode(dir, S_IFDIR | mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_dir;
-
- inode->i_op = &exofs_dir_inode_operations;
- inode->i_fop = &exofs_dir_operations;
- inode->i_mapping->a_ops = &exofs_aops;
-
- inode_inc_link_count(inode);
-
- err = exofs_make_empty(inode, dir);
- if (err)
- goto out_fail;
-
- err = exofs_add_link(dentry, inode);
- if (err)
- goto out_fail;
-
- d_instantiate(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- inode_dec_link_count(inode);
- iput(inode);
-out_dir:
- inode_dec_link_count(dir);
- goto out;
-}
-
-static int exofs_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- struct exofs_dir_entry *de;
- struct page *page;
- int err = -ENOENT;
-
- de = exofs_find_entry(dir, dentry, &page);
- if (!de)
- goto out;
-
- err = exofs_delete_entry(de, page);
- if (err)
- goto out;
-
- inode->i_ctime = dir->i_ctime;
- inode_dec_link_count(inode);
- err = 0;
-out:
- return err;
-}
-
-static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- int err = -ENOTEMPTY;
-
- if (exofs_empty_dir(inode)) {
- err = exofs_unlink(dir, dentry);
- if (!err) {
- inode->i_size = 0;
- inode_dec_link_count(inode);
- inode_dec_link_count(dir);
- }
- }
- return err;
-}
-
-static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
-{
- struct inode *old_inode = d_inode(old_dentry);
- struct inode *new_inode = d_inode(new_dentry);
- struct page *dir_page = NULL;
- struct exofs_dir_entry *dir_de = NULL;
- struct page *old_page;
- struct exofs_dir_entry *old_de;
- int err = -ENOENT;
-
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
- if (!old_de)
- goto out;
-
- if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
- dir_de = exofs_dotdot(old_inode, &dir_page);
- if (!dir_de)
- goto out_old;
- }
-
- if (new_inode) {
- struct page *new_page;
- struct exofs_dir_entry *new_de;
-
- err = -ENOTEMPTY;
- if (dir_de && !exofs_empty_dir(new_inode))
- goto out_dir;
-
- err = -ENOENT;
- new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
- if (!new_de)
- goto out_dir;
- err = exofs_set_link(new_dir, new_de, new_page, old_inode);
- new_inode->i_ctime = current_time(new_inode);
- if (dir_de)
- drop_nlink(new_inode);
- inode_dec_link_count(new_inode);
- if (err)
- goto out_dir;
- } else {
- err = exofs_add_link(new_dentry, old_inode);
- if (err)
- goto out_dir;
- if (dir_de)
- inode_inc_link_count(new_dir);
- }
-
- old_inode->i_ctime = current_time(old_inode);
-
- exofs_delete_entry(old_de, old_page);
- mark_inode_dirty(old_inode);
-
- if (dir_de) {
- err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
- inode_dec_link_count(old_dir);
- if (err)
- goto out_dir;
- }
- return 0;
-
-
-out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
-out_old:
- kunmap(old_page);
- put_page(old_page);
-out:
- return err;
-}
-
-const struct inode_operations exofs_dir_inode_operations = {
- .create = exofs_create,
- .lookup = exofs_lookup,
- .link = exofs_link,
- .unlink = exofs_unlink,
- .symlink = exofs_symlink,
- .mkdir = exofs_mkdir,
- .rmdir = exofs_rmdir,
- .mknod = exofs_mknod,
- .rename = exofs_rename,
- .setattr = exofs_setattr,
-};
-
-const struct inode_operations exofs_special_inode_operations = {
- .setattr = exofs_setattr,
-};
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
deleted file mode 100644
index 5331a15a61f1..000000000000
--- a/fs/exofs/ore.c
+++ /dev/null
@@ -1,1178 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <asm/div64.h>
-#include <linux/lcm.h>
-
-#include "ore_raid.h"
-
-MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
-
-/* ore_verify_layout does a couple of things:
- * 1. Given a minimum number of needed parameters fixes up the rest of the
- * members to be operatonals for the ore. The needed parameters are those
- * that are defined by the pnfs-objects layout STD.
- * 2. Check to see if the current ore code actually supports these parameters
- * for example stripe_unit must be a multple of the system PAGE_SIZE,
- * and etc...
- * 3. Cache some havily used calculations that will be needed by users.
- */
-
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
-
-int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
-{
- u64 stripe_length;
-
- switch (layout->raid_algorithm) {
- case PNFS_OSD_RAID_0:
- layout->parity = 0;
- break;
- case PNFS_OSD_RAID_5:
- layout->parity = 1;
- break;
- case PNFS_OSD_RAID_PQ:
- layout->parity = 2;
- break;
- case PNFS_OSD_RAID_4:
- default:
- ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
- layout->raid_algorithm);
- return -EINVAL;
- }
- if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
- ORE_ERR("Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(layout->stripe_unit), PAGE_SIZE);
- return -EINVAL;
- }
- if (layout->group_width) {
- if (!layout->group_depth) {
- ORE_ERR("group_depth == 0 && group_width != 0\n");
- return -EINVAL;
- }
- if (total_comps < (layout->group_width * layout->mirrors_p1)) {
- ORE_ERR("Data Map wrong, "
- "numdevs=%d < group_width=%d * mirrors=%d\n",
- total_comps, layout->group_width,
- layout->mirrors_p1);
- return -EINVAL;
- }
- layout->group_count = total_comps / layout->mirrors_p1 /
- layout->group_width;
- } else {
- if (layout->group_depth) {
- printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %lld\n",
- _LLU(layout->group_depth));
- }
- layout->group_width = total_comps / layout->mirrors_p1;
- layout->group_depth = -1;
- layout->group_count = 1;
- }
-
- stripe_length = (u64)layout->group_width * layout->stripe_unit;
- if (stripe_length >= (1ULL << 32)) {
- ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
- _LLU(stripe_length));
- return -EINVAL;
- }
-
- layout->max_io_length =
- (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
- (layout->group_width - layout->parity);
- if (layout->parity) {
- unsigned stripe_length =
- (layout->group_width - layout->parity) *
- layout->stripe_unit;
-
- layout->max_io_length /= stripe_length;
- layout->max_io_length *= stripe_length;
- }
- ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
-
- return 0;
-}
-EXPORT_SYMBOL(ore_verify_layout);
-
-static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
-{
- return ios->oc->comps[index & ios->oc->single_comp].cred;
-}
-
-static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
-{
- return &ios->oc->comps[index & ios->oc->single_comp].obj;
-}
-
-static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
-{
- ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
- ios->oc->first_dev, ios->oc->numdevs, index,
- ios->oc->ods);
-
- return ore_comp_dev(ios->oc, index);
-}
-
-int _ore_get_io_state(struct ore_layout *layout,
- struct ore_components *oc, unsigned numdevs,
- unsigned sgs_per_dev, unsigned num_par_pages,
- struct ore_io_state **pios)
-{
- struct ore_io_state *ios;
- size_t size_ios, size_extra, size_total;
- void *ios_extra;
-
- /*
- * The desired layout looks like this, with the extra_allocation
- * items pointed at from fields within ios or per_dev:
-
- struct __alloc_all_io_state {
- struct ore_io_state ios;
- struct ore_per_dev_state per_dev[numdevs];
- union {
- struct osd_sg_entry sglist[sgs_per_dev * numdevs];
- struct page *pages[num_par_pages];
- } extra_allocation;
- } whole_allocation;
-
- */
-
- /* This should never happen, so abort early if it ever does. */
- if (sgs_per_dev && num_par_pages) {
- ORE_DBGMSG("Tried to use both pages and sglist\n");
- *pios = NULL;
- return -EINVAL;
- }
-
- if (numdevs > (INT_MAX - sizeof(*ios)) /
- sizeof(struct ore_per_dev_state))
- return -ENOMEM;
- size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs;
-
- if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry))
- return -ENOMEM;
- if (num_par_pages > INT_MAX / sizeof(struct page *))
- return -ENOMEM;
- size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs),
- sizeof(struct page *) * num_par_pages);
-
- size_total = size_ios + size_extra;
-
- if (likely(size_total <= PAGE_SIZE)) {
- ios = kzalloc(size_total, GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total);
- *pios = NULL;
- return -ENOMEM;
- }
- ios_extra = (char *)ios + size_ios;
- } else {
- ios = kzalloc(size_ios, GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
- size_ios);
- *pios = NULL;
- return -ENOMEM;
- }
- ios_extra = kzalloc(size_extra, GFP_KERNEL);
- if (unlikely(!ios_extra)) {
- ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
- size_extra);
- kfree(ios);
- *pios = NULL;
- return -ENOMEM;
- }
-
- /* In this case the per_dev[0].sgilist holds the pointer to
- * be freed
- */
- ios->extra_part_alloc = true;
- }
-
- if (num_par_pages) {
- ios->parity_pages = ios_extra;
- ios->max_par_pages = num_par_pages;
- }
- if (sgs_per_dev) {
- struct osd_sg_entry *sgilist = ios_extra;
- unsigned d;
-
- for (d = 0; d < numdevs; ++d) {
- ios->per_dev[d].sglist = sgilist;
- sgilist += sgs_per_dev;
- }
- ios->sgs_per_dev = sgs_per_dev;
- }
-
- ios->layout = layout;
- ios->oc = oc;
- *pios = ios;
- return 0;
-}
-
-/* Allocate an io_state for only a single group of devices
- *
- * If a user needs to call ore_read/write() this version must be used becase it
- * allocates extra stuff for striping and raid.
- * The ore might decide to only IO less then @length bytes do to alignmets
- * and constrains as follows:
- * - The IO cannot cross group boundary.
- * - In raid5/6 The end of the IO must align at end of a stripe eg.
- * (@offset + @length) % strip_size == 0. Or the complete range is within a
- * single stripe.
- * - Memory condition only permitted a shorter IO. (A user can use @length=~0
- * And check the returned ios->length for max_io_size.)
- *
- * The caller must check returned ios->length (and/or ios->nr_pages) and
- * re-issue these pages that fall outside of ios->length
- */
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
- bool is_reading, u64 offset, u64 length,
- struct ore_io_state **pios)
-{
- struct ore_io_state *ios;
- unsigned numdevs = layout->group_width * layout->mirrors_p1;
- unsigned sgs_per_dev = 0, max_par_pages = 0;
- int ret;
-
- if (layout->parity && length) {
- unsigned data_devs = layout->group_width - layout->parity;
- unsigned stripe_size = layout->stripe_unit * data_devs;
- unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
- u32 remainder;
- u64 num_stripes;
- u64 num_raid_units;
-
- num_stripes = div_u64_rem(length, stripe_size, &remainder);
- if (remainder)
- ++num_stripes;
-
- num_raid_units = num_stripes * layout->parity;
-
- if (is_reading) {
- /* For reads add per_dev sglist array */
- /* TODO: Raid 6 we need twice more. Actually:
- * num_stripes / LCMdP(W,P);
- * if (W%P != 0) num_stripes *= parity;
- */
-
- /* first/last seg is split */
- num_raid_units += layout->group_width;
- sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
- } else {
- /* For Writes add parity pages array. */
- max_par_pages = num_raid_units * pages_in_unit *
- sizeof(struct page *);
- }
- }
-
- ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
- pios);
- if (unlikely(ret))
- return ret;
-
- ios = *pios;
- ios->reading = is_reading;
- ios->offset = offset;
-
- if (length) {
- ore_calc_stripe_info(layout, offset, length, &ios->si);
- ios->length = ios->si.length;
- ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
- ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
- if (layout->parity)
- _ore_post_alloc_raid_stuff(ios);
- }
-
- return 0;
-}
-EXPORT_SYMBOL(ore_get_rw_state);
-
-/* Allocate an io_state for all the devices in the comps array
- *
- * This version of io_state allocation is used mostly by create/remove
- * and trunc where we currently need all the devices. The only wastful
- * bit is the read/write_attributes with no IO. Those sites should
- * be converted to use ore_get_rw_state() with length=0
- */
-int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
- struct ore_io_state **pios)
-{
- return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
-}
-EXPORT_SYMBOL(ore_get_io_state);
-
-void ore_put_io_state(struct ore_io_state *ios)
-{
- if (ios) {
- unsigned i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[i];
-
- if (per_dev->or)
- osd_end_request(per_dev->or);
- if (per_dev->bio)
- bio_put(per_dev->bio);
- }
-
- _ore_free_raid_stuff(ios);
- kfree(ios);
- }
-}
-EXPORT_SYMBOL(ore_put_io_state);
-
-static void _sync_done(struct ore_io_state *ios, void *p)
-{
- struct completion *waiting = p;
-
- complete(waiting);
-}
-
-static void _last_io(struct kref *kref)
-{
- struct ore_io_state *ios = container_of(
- kref, struct ore_io_state, kref);
-
- ios->done(ios, ios->private);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
- struct ore_io_state *ios = p;
-
- kref_put(&ios->kref, _last_io);
-}
-
-int ore_io_execute(struct ore_io_state *ios)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- bool sync = (ios->done == NULL);
- int i, ret;
-
- if (sync) {
- ios->done = _sync_done;
- ios->private = &wait;
- }
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
- if (unlikely(ret)) {
- ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
- ret);
- return ret;
- }
- }
-
- kref_init(&ios->kref);
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- kref_get(&ios->kref);
- osd_execute_request_async(or, _done_io, ios);
- }
-
- kref_put(&ios->kref, _last_io);
- ret = 0;
-
- if (sync) {
- wait_for_completion(&wait);
- ret = ore_check_io(ios, NULL);
- }
- return ret;
-}
-
-static void _clear_bio(struct bio *bio)
-{
- struct bio_vec *bv;
- unsigned i;
-
- bio_for_each_segment_all(bv, bio, i) {
- unsigned this_count = bv->bv_len;
-
- if (likely(PAGE_SIZE == this_count))
- clear_highpage(bv->bv_page);
- else
- zero_user(bv->bv_page, bv->bv_offset, this_count);
- }
-}
-
-int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
-{
- enum osd_err_priority acumulated_osd_err = 0;
- int acumulated_lin_err = 0;
- int i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_sense_info osi;
- struct ore_per_dev_state *per_dev = &ios->per_dev[i];
- struct osd_request *or = per_dev->or;
- int ret;
-
- if (unlikely(!or))
- continue;
-
- ret = osd_req_decode_sense(or, &osi);
- if (likely(!ret))
- continue;
-
- if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
- per_dev->bio) {
- /* start read offset passed endof file.
- * Note: if we do not have bio it means read-attributes
- * In this case we should return error to caller.
- */
- _clear_bio(per_dev->bio);
- ORE_DBGMSG("start read offset passed end of file "
- "offset=0x%llx, length=0x%llx\n",
- _LLU(per_dev->offset),
- _LLU(per_dev->length));
-
- continue; /* we recovered */
- }
-
- if (on_dev_error) {
- u64 residual = ios->reading ?
- or->in.residual : or->out.residual;
- u64 offset = (ios->offset + ios->length) - residual;
- unsigned dev = per_dev->dev - ios->oc->first_dev;
- struct ore_dev *od = ios->oc->ods[dev];
-
- on_dev_error(ios, od, dev, osi.osd_err_pri,
- offset, residual);
- }
- if (osi.osd_err_pri >= acumulated_osd_err) {
- acumulated_osd_err = osi.osd_err_pri;
- acumulated_lin_err = ret;
- }
- }
-
- return acumulated_lin_err;
-}
-EXPORT_SYMBOL(ore_check_io);
-
-/*
- * L - logical offset into the file
- *
- * D - number of Data devices
- * D = group_width - parity
- *
- * U - The number of bytes in a stripe within a group
- * U = stripe_unit * D
- *
- * T - The number of bytes striped within a group of component objects
- * (before advancing to the next group)
- * T = U * group_depth
- *
- * S - The number of bytes striped across all component objects
- * before the pattern repeats
- * S = T * group_count
- *
- * M - The "major" (i.e., across all components) cycle number
- * M = L / S
- *
- * G - Counts the groups from the beginning of the major cycle
- * G = (L - (M * S)) / T [or (L % S) / T]
- *
- * H - The byte offset within the group
- * H = (L - (M * S)) % T [or (L % S) % T]
- *
- * N - The "minor" (i.e., across the group) stripe number
- * N = H / U
- *
- * C - The component index coresponding to L
- *
- * C = (H - (N * U)) / stripe_unit + G * D
- * [or (L % U) / stripe_unit + G * D]
- *
- * O - The component offset coresponding to L
- * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
- *
- * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
- * divide by parity
- * LCMdP = lcm(group_width, parity) / parity
- *
- * R - The parity Rotation stripe
- * (Note parity cycle always starts at a group's boundary)
- * R = N % LCMdP
- *
- * I = the first parity device index
- * I = (group_width + group_width - R*parity - parity) % group_width
- *
- * Craid - The component index Rotated
- * Craid = (group_width + C - R*parity) % group_width
- * (We add the group_width to avoid negative numbers modulo math)
- */
-void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
- u64 length, struct ore_striping_info *si)
-{
- u32 stripe_unit = layout->stripe_unit;
- u32 group_width = layout->group_width;
- u64 group_depth = layout->group_depth;
- u32 parity = layout->parity;
-
- u32 D = group_width - parity;
- u32 U = D * stripe_unit;
- u64 T = U * group_depth;
- u64 S = T * layout->group_count;
- u64 M = div64_u64(file_offset, S);
-
- /*
- G = (L - (M * S)) / T
- H = (L - (M * S)) % T
- */
- u64 LmodS = file_offset - M * S;
- u32 G = div64_u64(LmodS, T);
- u64 H = LmodS - G * T;
-
- u32 N = div_u64(H, U);
- u32 Nlast;
-
- /* "H - (N * U)" is just "H % U" so it's bound to u32 */
- u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- u32 first_dev = C - C % group_width;
-
- div_u64_rem(file_offset, stripe_unit, &si->unit_off);
-
- si->obj_offset = si->unit_off + (N * stripe_unit) +
- (M * group_depth * stripe_unit);
- si->cur_comp = C - first_dev;
- si->cur_pg = si->unit_off / PAGE_SIZE;
-
- if (parity) {
- u32 LCMdP = lcm(group_width, parity) / parity;
- /* R = N % LCMdP; */
- u32 RxP = (N % LCMdP) * parity;
-
- si->par_dev = (group_width + group_width - parity - RxP) %
- group_width + first_dev;
- si->dev = (group_width + group_width + C - RxP) %
- group_width + first_dev;
- si->bytes_in_stripe = U;
- si->first_stripe_start = M * S + G * T + N * U;
- } else {
- /* Make the math correct see _prepare_one_group */
- si->par_dev = group_width;
- si->dev = C;
- }
-
- si->dev *= layout->mirrors_p1;
- si->par_dev *= layout->mirrors_p1;
- si->offset = file_offset;
- si->length = T - H;
- if (si->length > length)
- si->length = length;
-
- Nlast = div_u64(H + si->length + U - 1, U);
- si->maxdevUnits = Nlast - N;
-
- si->M = M;
-}
-EXPORT_SYMBOL(ore_calc_stripe_info);
-
-int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct page **pages,
- struct ore_per_dev_state *per_dev, int cur_len)
-{
- unsigned pg = *cur_pg;
- struct request_queue *q =
- osd_request_queue(_ios_od(ios, per_dev->dev));
- unsigned len = cur_len;
- int ret;
-
- if (per_dev->bio == NULL) {
- unsigned bio_size;
-
- if (!ios->reading) {
- bio_size = ios->si.maxdevUnits;
- } else {
- bio_size = (ios->si.maxdevUnits + 1) *
- (ios->layout->group_width - ios->layout->parity) /
- ios->layout->group_width;
- }
- bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
-
- per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
- if (unlikely(!per_dev->bio)) {
- ORE_DBGMSG("Failed to allocate BIO size=%u\n",
- bio_size);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- while (cur_len > 0) {
- unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
- unsigned added_len;
-
- cur_len -= pglen;
-
- added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
- pglen, pgbase);
- if (unlikely(pglen != added_len)) {
- /* If bi_vcnt == bi_max then this is a SW BUG */
- ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
- "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
- per_dev->bio->bi_vcnt,
- per_dev->bio->bi_max_vecs,
- BIO_MAX_PAGES_KMALLOC, cur_len);
- ret = -ENOMEM;
- goto out;
- }
- _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
-
- pgbase = 0;
- ++pg;
- }
- BUG_ON(cur_len);
-
- per_dev->length += len;
- *cur_pg = pg;
- ret = 0;
-out: /* we fail the complete unit on an error eg don't advance
- * per_dev->length and cur_pg. This means that we might have a bigger
- * bio than the CDB requested length (per_dev->length). That's fine
- * only the oposite is fatal.
- */
- return ret;
-}
-
-static int _add_parity_units(struct ore_io_state *ios,
- struct ore_striping_info *si,
- unsigned dev, unsigned first_dev,
- unsigned mirrors_p1, unsigned devs_in_group,
- unsigned cur_len)
-{
- unsigned do_parity;
- int ret = 0;
-
- for (do_parity = ios->layout->parity; do_parity; --do_parity) {
- struct ore_per_dev_state *per_dev;
-
- per_dev = &ios->per_dev[dev - first_dev];
- if (!per_dev->length && !per_dev->offset) {
- /* Only/always the parity unit of the first
- * stripe will be empty. So this is a chance to
- * initialize the per_dev info.
- */
- per_dev->dev = dev;
- per_dev->offset = si->obj_offset - si->unit_off;
- }
-
- ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
- do_parity == 1);
- if (unlikely(ret))
- break;
-
- if (do_parity != 1) {
- dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
- si->cur_comp = (si->cur_comp + 1) %
- ios->layout->group_width;
- }
- }
-
- return ret;
-}
-
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
- struct ore_striping_info *si = &ios->si;
- unsigned stripe_unit = ios->layout->stripe_unit;
- unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned group_width = ios->layout->group_width;
- unsigned devs_in_group = group_width * mirrors_p1;
- unsigned dev = si->dev;
- unsigned first_dev = dev - (dev % devs_in_group);
- unsigned cur_pg = ios->pages_consumed;
- u64 length = ios->length;
- int ret = 0;
-
- if (!ios->pages) {
- ios->numdevs = ios->layout->mirrors_p1;
- return 0;
- }
-
- BUG_ON(length > si->length);
-
- while (length) {
- struct ore_per_dev_state *per_dev =
- &ios->per_dev[dev - first_dev];
- unsigned cur_len, page_off = 0;
-
- if (!per_dev->length && !per_dev->offset) {
- /* First time initialize the per_dev info. */
- per_dev->dev = dev;
- if (dev == si->dev) {
- WARN_ON(dev == si->par_dev);
- per_dev->offset = si->obj_offset;
- cur_len = stripe_unit - si->unit_off;
- page_off = si->unit_off & ~PAGE_MASK;
- BUG_ON(page_off && (page_off != ios->pgbase));
- } else {
- per_dev->offset = si->obj_offset - si->unit_off;
- cur_len = stripe_unit;
- }
- } else {
- cur_len = stripe_unit;
- }
- if (cur_len >= length)
- cur_len = length;
-
- ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
- per_dev, cur_len);
- if (unlikely(ret))
- goto out;
-
- length -= cur_len;
-
- dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
- si->cur_comp = (si->cur_comp + 1) % group_width;
- if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
- if (!length && ios->sp2d) {
- /* If we are writing and this is the very last
- * stripe. then operate on parity dev.
- */
- dev = si->par_dev;
- /* If last stripe operate on parity comp */
- si->cur_comp = group_width - ios->layout->parity;
- }
-
- /* In writes cur_len just means if it's the
- * last one. See _ore_add_parity_unit.
- */
- ret = _add_parity_units(ios, si, dev, first_dev,
- mirrors_p1, devs_in_group,
- ios->sp2d ? length : cur_len);
- if (unlikely(ret))
- goto out;
-
- /* Rotate next par_dev backwards with wraping */
- si->par_dev = (devs_in_group + si->par_dev -
- ios->layout->parity * mirrors_p1) %
- devs_in_group + first_dev;
- /* Next stripe, start fresh */
- si->cur_comp = 0;
- si->cur_pg = 0;
- si->obj_offset += cur_len;
- si->unit_off = 0;
- }
- }
-out:
- ios->numdevs = devs_in_group;
- ios->pages_consumed = cur_pg;
- return ret;
-}
-
-int ore_create(struct ore_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->oc->numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, i));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_create_object(or, _ios_obj(ios, i));
- }
- ret = ore_io_execute(ios);
-
-out:
- return ret;
-}
-EXPORT_SYMBOL(ore_create);
-
-int ore_remove(struct ore_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->oc->numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, i));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_remove_object(or, _ios_obj(ios, i));
- }
- ret = ore_io_execute(ios);
-
-out:
- return ret;
-}
-EXPORT_SYMBOL(ore_remove);
-
-static int _write_mirror(struct ore_io_state *ios, int cur_comp)
-{
- struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
- unsigned dev = ios->per_dev[cur_comp].dev;
- unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
- int ret = 0;
-
- if (ios->pages && !master_dev->length)
- return 0; /* Just an empty slot */
-
- for (; cur_comp < last_comp; ++cur_comp, ++dev) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, dev));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- per_dev->or = or;
-
- if (ios->pages) {
- struct bio *bio;
-
- if (per_dev != master_dev) {
- bio = bio_clone_fast(master_dev->bio,
- GFP_KERNEL, NULL);
- if (unlikely(!bio)) {
- ORE_DBGMSG(
- "Failed to allocate BIO size=%u\n",
- master_dev->bio->bi_max_vecs);
- ret = -ENOMEM;
- goto out;
- }
-
- bio->bi_disk = NULL;
- bio->bi_next = NULL;
- per_dev->offset = master_dev->offset;
- per_dev->length = master_dev->length;
- per_dev->bio = bio;
- per_dev->dev = dev;
- } else {
- bio = master_dev->bio;
- /* FIXME: bio_set_dir() */
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- }
-
- osd_req_write(or, _ios_obj(ios, cur_comp),
- per_dev->offset, bio, per_dev->length);
- ORE_DBGMSG("write(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- _LLU(per_dev->offset),
- _LLU(per_dev->length), dev);
- } else if (ios->kern_buff) {
- per_dev->offset = ios->si.obj_offset;
- per_dev->dev = ios->si.dev + dev;
-
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (ios->si.unit_off + ios->length >
- ios->layout->stripe_unit));
-
- ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
- per_dev->offset,
- ios->kern_buff, ios->length);
- if (unlikely(ret))
- goto out;
- ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- _LLU(per_dev->offset),
- _LLU(ios->length), per_dev->dev);
- } else {
- osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
- ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- ios->out_attr_len, dev);
- }
-
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr,
- ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr,
- ios->in_attr_len);
- }
-
-out:
- return ret;
-}
-
-int ore_write(struct ore_io_state *ios)
-{
- int i;
- int ret;
-
- if (unlikely(ios->sp2d && !ios->r4w)) {
- /* A library is attempting a RAID-write without providing
- * a pages lock interface.
- */
- WARN_ON_ONCE(1);
- return -ENOTSUPP;
- }
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _write_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_write);
-
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
-{
- struct osd_request *or;
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
- unsigned first_dev = (unsigned)obj->id;
-
- if (ios->pages && !per_dev->length)
- return 0; /* Just an empty slot */
-
- first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
- or = osd_start_request(_ios_od(ios, first_dev));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- if (ios->pages) {
- if (per_dev->cur_sg) {
- /* finalize the last sg_entry */
- _ore_add_sg_seg(per_dev, 0, false);
- if (unlikely(!per_dev->cur_sg))
- return 0; /* Skip parity only device */
-
- osd_req_read_sg(or, obj, per_dev->bio,
- per_dev->sglist, per_dev->cur_sg);
- } else {
- /* The no raid case */
- osd_req_read(or, obj, per_dev->offset,
- per_dev->bio, per_dev->length);
- }
-
- ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
- " dev=%d sg_len=%d\n", _LLU(obj->id),
- _LLU(per_dev->offset), _LLU(per_dev->length),
- first_dev, per_dev->cur_sg);
- } else {
- BUG_ON(ios->kern_buff);
-
- osd_req_get_attributes(or, obj);
- ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
- _LLU(obj->id),
- ios->in_attr_len, first_dev);
- }
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
-
- return 0;
-}
-
-int ore_read(struct ore_io_state *ios)
-{
- int i;
- int ret;
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _ore_read_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_read);
-
-int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
-{
- struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
- void *iter = NULL;
- int nelem;
-
- do {
- nelem = 1;
- osd_req_decode_get_attr_list(ios->per_dev[0].or,
- &cur_attr, &nelem, &iter);
- if ((cur_attr.attr_page == attr->attr_page) &&
- (cur_attr.attr_id == attr->attr_id)) {
- attr->len = cur_attr.len;
- attr->val_ptr = cur_attr.val_ptr;
- return 0;
- }
- } while (iter);
-
- return -EIO;
-}
-EXPORT_SYMBOL(extract_attr_from_ios);
-
-static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
- struct osd_attr *attr)
-{
- int last_comp = cur_comp + ios->layout->mirrors_p1;
-
- for (; cur_comp < last_comp; ++cur_comp) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, cur_comp));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
- osd_req_add_set_attr_list(or, attr, 1);
- }
-
- return 0;
-}
-
-struct _trunc_info {
- struct ore_striping_info si;
- u64 prev_group_obj_off;
- u64 next_group_obj_off;
-
- unsigned first_group_dev;
- unsigned nex_group_dev;
-};
-
-static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
- struct _trunc_info *ti)
-{
- unsigned stripe_unit = layout->stripe_unit;
-
- ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
-
- ti->prev_group_obj_off = ti->si.M * stripe_unit;
- ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
-
- ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
- ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-}
-
-int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
- u64 size)
-{
- struct ore_io_state *ios;
- struct exofs_trunc_attr {
- struct osd_attr attr;
- __be64 newsize;
- } *size_attrs;
- struct _trunc_info ti;
- int i, ret;
-
- ret = ore_get_io_state(layout, oc, &ios);
- if (unlikely(ret))
- return ret;
-
- _calc_trunk_info(ios->layout, size, &ti);
-
- size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
- GFP_KERNEL);
- if (unlikely(!size_attrs)) {
- ret = -ENOMEM;
- goto out;
- }
-
- ios->numdevs = ios->oc->numdevs;
-
- for (i = 0; i < ios->numdevs; ++i) {
- struct exofs_trunc_attr *size_attr = &size_attrs[i];
- u64 obj_size;
-
- if (i < ti.first_group_dev)
- obj_size = ti.prev_group_obj_off;
- else if (i >= ti.nex_group_dev)
- obj_size = ti.next_group_obj_off;
- else if (i < ti.si.dev) /* dev within this group */
- obj_size = ti.si.obj_offset +
- ios->layout->stripe_unit - ti.si.unit_off;
- else if (i == ti.si.dev)
- obj_size = ti.si.obj_offset;
- else /* i > ti.dev */
- obj_size = ti.si.obj_offset - ti.si.unit_off;
-
- size_attr->newsize = cpu_to_be64(obj_size);
- size_attr->attr = g_attr_logical_length;
- size_attr->attr.val_ptr = &size_attr->newsize;
-
- ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
- _LLU(oc->comps->obj.id), _LLU(obj_size), i);
- ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
- &size_attr->attr);
- if (unlikely(ret))
- goto out;
- }
- ret = ore_io_execute(ios);
-
-out:
- kfree(size_attrs);
- ore_put_io_state(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_truncate);
-
-const struct osd_attr g_attr_logical_length = ATTR_DEF(
- OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-EXPORT_SYMBOL(g_attr_logical_length);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
deleted file mode 100644
index 199590f36203..000000000000
--- a/fs/exofs/ore_raid.c
+++ /dev/null
@@ -1,756 +0,0 @@
-/*
- * Copyright (C) 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- * "Free Software Foundation <info@fsf.org>"
- */
-
-#include <linux/gfp.h>
-#include <linux/async_tx.h>
-
-#include "ore_raid.h"
-
-#undef ORE_DBGMSG2
-#define ORE_DBGMSG2 ORE_DBGMSG
-
-static struct page *_raid_page_alloc(void)
-{
- return alloc_page(GFP_KERNEL);
-}
-
-static void _raid_page_free(struct page *p)
-{
- __free_page(p);
-}
-
-/* This struct is forward declare in ore_io_state, but is private to here.
- * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
- *
- * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
- * Ascending page index access is sp2d(p-minor, c-major). But storage is
- * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
- * API.
- */
-struct __stripe_pages_2d {
- /* Cache some hot path repeated calculations */
- unsigned parity;
- unsigned data_devs;
- unsigned pages_in_unit;
-
- bool needed ;
-
- /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
- struct __1_page_stripe {
- bool alloc;
- unsigned write_count;
- struct async_submit_ctl submit;
- struct dma_async_tx_descriptor *tx;
-
- /* The size of this array is data_devs + parity */
- struct page **pages;
- struct page **scribble;
- /* bool array, size of this array is data_devs */
- char *page_is_read;
- } _1p_stripes[];
-};
-
-/* This can get bigger then a page. So support multiple page allocations
- * _sp2d_free should be called even if _sp2d_alloc fails (by returning
- * none-zero).
- */
-static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
- unsigned parity, struct __stripe_pages_2d **psp2d)
-{
- struct __stripe_pages_2d *sp2d;
- unsigned data_devs = group_width - parity;
-
- /*
- * Desired allocation layout is, though when larger than PAGE_SIZE,
- * each struct __alloc_1p_arrays is separately allocated:
-
- struct _alloc_all_bytes {
- struct __alloc_stripe_pages_2d {
- struct __stripe_pages_2d sp2d;
- struct __1_page_stripe _1p_stripes[pages_in_unit];
- } __asp2d;
- struct __alloc_1p_arrays {
- struct page *pages[group_width];
- struct page *scribble[group_width];
- char page_is_read[data_devs];
- } __a1pa[pages_in_unit];
- } *_aab;
-
- struct __alloc_1p_arrays *__a1pa;
- struct __alloc_1p_arrays *__a1pa_end;
-
- */
-
- char *__a1pa;
- char *__a1pa_end;
-
- const size_t sizeof_stripe_pages_2d =
- sizeof(struct __stripe_pages_2d) +
- sizeof(struct __1_page_stripe) * pages_in_unit;
- const size_t sizeof__a1pa =
- ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs,
- sizeof(void *));
- const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit;
- const size_t alloc_total = sizeof_stripe_pages_2d +
- sizeof__a1pa_arrays;
-
- unsigned num_a1pa, alloc_size, i;
-
- /* FIXME: check these numbers in ore_verify_layout */
- BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE);
- BUG_ON(sizeof__a1pa > PAGE_SIZE);
-
- /*
- * If alloc_total would be larger than PAGE_SIZE, only allocate
- * as many a1pa items as would fill the rest of the page, instead
- * of the full pages_in_unit count.
- */
- if (alloc_total > PAGE_SIZE) {
- num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa;
- alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa;
- } else {
- num_a1pa = pages_in_unit;
- alloc_size = alloc_total;
- }
-
- *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL);
- if (unlikely(!sp2d)) {
- ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
- return -ENOMEM;
- }
- /* From here Just call _sp2d_free */
-
- /* Find start of a1pa area. */
- __a1pa = (char *)sp2d + sizeof_stripe_pages_2d;
- /* Find end of the _allocated_ a1pa area. */
- __a1pa_end = __a1pa + alloc_size;
-
- /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */
- for (i = 0; i < pages_in_unit; ++i) {
- struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i];
-
- if (unlikely(__a1pa >= __a1pa_end)) {
- num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
- pages_in_unit - i);
- alloc_size = sizeof__a1pa * num_a1pa;
-
- __a1pa = kzalloc(alloc_size, GFP_KERNEL);
- if (unlikely(!__a1pa)) {
- ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
- num_a1pa);
- return -ENOMEM;
- }
- __a1pa_end = __a1pa + alloc_size;
- /* First *pages is marked for kfree of the buffer */
- stripe->alloc = true;
- }
-
- /*
- * Attach all _lp_stripes pointers to the allocation for
- * it which was either part of the original PAGE_SIZE
- * allocation or the subsequent allocation in this loop.
- */
- stripe->pages = (void *)__a1pa;
- stripe->scribble = stripe->pages + group_width;
- stripe->page_is_read = (char *)stripe->scribble + group_width;
- __a1pa += sizeof__a1pa;
- }
-
- sp2d->parity = parity;
- sp2d->data_devs = data_devs;
- sp2d->pages_in_unit = pages_in_unit;
- return 0;
-}
-
-static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
- const struct _ore_r4w_op *r4w, void *priv)
-{
- unsigned data_devs = sp2d->data_devs;
- unsigned group_width = data_devs + sp2d->parity;
- int p, c;
-
- if (!sp2d->needed)
- return;
-
- for (c = data_devs - 1; c >= 0; --c)
- for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->page_is_read[c]) {
- struct page *page = _1ps->pages[c];
-
- r4w->put_page(priv, page);
- _1ps->page_is_read[c] = false;
- }
- }
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
- _1ps->write_count = 0;
- _1ps->tx = NULL;
- }
-
- sp2d->needed = false;
-}
-
-static void _sp2d_free(struct __stripe_pages_2d *sp2d)
-{
- unsigned i;
-
- if (!sp2d)
- return;
-
- for (i = 0; i < sp2d->pages_in_unit; ++i) {
- if (sp2d->_1p_stripes[i].alloc)
- kfree(sp2d->_1p_stripes[i].pages);
- }
-
- kfree(sp2d);
-}
-
-static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
-{
- unsigned p;
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->write_count)
- return p;
- }
-
- return ~0;
-}
-
-static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
-{
- int p;
-
- for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->write_count)
- return p;
- }
-
- return ~0;
-}
-
-static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
-{
- unsigned p;
- unsigned tx_flags = ASYNC_TX_ACK;
-
- if (sp2d->parity == 1)
- tx_flags |= ASYNC_TX_XOR_ZERO_DST;
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (!_1ps->write_count)
- continue;
-
- init_async_submit(&_1ps->submit, tx_flags,
- NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
-
- if (sp2d->parity == 1)
- _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
- _1ps->pages, 0, sp2d->data_devs,
- PAGE_SIZE, &_1ps->submit);
- else /* parity == 2 */
- _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
- sp2d->data_devs + sp2d->parity,
- PAGE_SIZE, &_1ps->submit);
- }
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
- /* NOTE: We wait for HW synchronously (I don't have such HW
- * to test with.) Is parallelism needed with today's multi
- * cores?
- */
- async_tx_issue_pending(_1ps->tx);
- }
-}
-
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page)
-{
- struct __1_page_stripe *_1ps;
-
- sp2d->needed = true;
-
- _1ps = &sp2d->_1p_stripes[si->cur_pg];
- _1ps->pages[si->cur_comp] = page;
- ++_1ps->write_count;
-
- si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
- /* si->cur_comp is advanced outside at main loop */
-}
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool not_last)
-{
- struct osd_sg_entry *sge;
-
- ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
- "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
- per_dev->dev, cur_len, not_last, per_dev->cur_sg,
- _LLU(per_dev->offset), per_dev->length,
- per_dev->last_sgs_total);
-
- if (!per_dev->cur_sg) {
- sge = per_dev->sglist;
-
- /* First time we prepare two entries */
- if (per_dev->length) {
- ++per_dev->cur_sg;
- sge->offset = per_dev->offset;
- sge->len = per_dev->length;
- } else {
- /* Here the parity is the first unit of this object.
- * This happens every time we reach a parity device on
- * the same stripe as the per_dev->offset. We need to
- * just skip this unit.
- */
- per_dev->offset += cur_len;
- return;
- }
- } else {
- /* finalize the last one */
- sge = &per_dev->sglist[per_dev->cur_sg - 1];
- sge->len = per_dev->length - per_dev->last_sgs_total;
- }
-
- if (not_last) {
- /* Partly prepare the next one */
- struct osd_sg_entry *next_sge = sge + 1;
-
- ++per_dev->cur_sg;
- next_sge->offset = sge->offset + sge->len + cur_len;
- /* Save cur len so we know how mutch was added next time */
- per_dev->last_sgs_total = per_dev->length;
- next_sge->len = 0;
- } else if (!sge->len) {
- /* Optimize for when the last unit is a parity */
- --per_dev->cur_sg;
- }
-}
-
-static int _alloc_read_4_write(struct ore_io_state *ios)
-{
- struct ore_layout *layout = ios->layout;
- int ret;
- /* We want to only read those pages not in cache so worst case
- * is a stripe populated with every other page
- */
- unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
-
- ret = _ore_get_io_state(layout, ios->oc,
- layout->group_width * layout->mirrors_p1,
- sgs_per_dev, 0, &ios->ios_read_4_write);
- return ret;
-}
-
-/* @si contains info of the to-be-inserted page. Update of @si should be
- * maintained by caller. Specificaly si->dev, si->obj_offset, ...
- */
-static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
- struct page *page, unsigned pg_len)
-{
- struct request_queue *q;
- struct ore_per_dev_state *per_dev;
- struct ore_io_state *read_ios;
- unsigned first_dev = si->dev - (si->dev %
- (ios->layout->group_width * ios->layout->mirrors_p1));
- unsigned comp = si->dev - first_dev;
- unsigned added_len;
-
- if (!ios->ios_read_4_write) {
- int ret = _alloc_read_4_write(ios);
-
- if (unlikely(ret))
- return ret;
- }
-
- read_ios = ios->ios_read_4_write;
- read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
-
- per_dev = &read_ios->per_dev[comp];
- if (!per_dev->length) {
- per_dev->bio = bio_kmalloc(GFP_KERNEL,
- ios->sp2d->pages_in_unit);
- if (unlikely(!per_dev->bio)) {
- ORE_DBGMSG("Failed to allocate BIO size=%u\n",
- ios->sp2d->pages_in_unit);
- return -ENOMEM;
- }
- per_dev->offset = si->obj_offset;
- per_dev->dev = si->dev;
- } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
- u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
-
- _ore_add_sg_seg(per_dev, gap, true);
- }
- q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
- added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
- si->obj_offset % PAGE_SIZE);
- if (unlikely(added_len != pg_len)) {
- ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
- per_dev->bio->bi_vcnt);
- return -ENOMEM;
- }
-
- per_dev->length += pg_len;
- return 0;
-}
-
-/* read the beginning of an unaligned first page */
-static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
-{
- struct ore_striping_info si;
- unsigned pg_len;
-
- ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
-
- pg_len = si.obj_offset % PAGE_SIZE;
- si.obj_offset -= pg_len;
-
- ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
- _LLU(si.obj_offset), pg_len, page->index, si.dev);
-
- return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-/* read the end of an incomplete last page */
-static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
-{
- struct ore_striping_info si;
- struct page *page;
- unsigned pg_len, p, c;
-
- ore_calc_stripe_info(ios->layout, *offset, 0, &si);
-
- p = si.cur_pg;
- c = si.cur_comp;
- page = ios->sp2d->_1p_stripes[p].pages[c];
-
- pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
- *offset += pg_len;
-
- ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
- p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
-
- BUG_ON(!page);
-
- return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
-{
- struct bio_vec *bv;
- unsigned i, d;
-
- /* loop on all devices all pages */
- for (d = 0; d < ios->numdevs; d++) {
- struct bio *bio = ios->per_dev[d].bio;
-
- if (!bio)
- continue;
-
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- }
- }
-}
-
-/* read_4_write is hacked to read the start of the first stripe and/or
- * the end of the last stripe. If needed, with an sg-gap at each device/page.
- * It is assumed to be called after the to_be_written pages of the first stripe
- * are populating ios->sp2d[][]
- *
- * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
- * These pages are held at sp2d[p].pages[c] but with
- * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
- * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
- * @uptodate=true, so we don't need to read it, only unlock, after IO.
- *
- * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
- * to-be-written count, we should consider the xor-in-place mode.
- * need_to_read_pages_count is the actual number of pages not present in cache.
- * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
- * approximation? In this mode the read pages are put in the empty places of
- * ios->sp2d[p][*], xor is calculated the same way. These pages are
- * allocated/freed and don't go through cache
- */
-static int _read_4_write_first_stripe(struct ore_io_state *ios)
-{
- struct ore_striping_info read_si;
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- u64 offset = ios->si.first_stripe_start;
- unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
- if (offset == ios->offset) /* Go to start collect $200 */
- goto read_last_stripe;
-
- min_p = _sp2d_min_pg(sp2d);
- max_p = _sp2d_max_pg(sp2d);
-
- ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
- offset, ios->offset, min_p, max_p);
-
- for (c = 0; ; c++) {
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- read_si.obj_offset += min_p * PAGE_SIZE;
- offset += min_p * PAGE_SIZE;
- for (p = min_p; p <= max_p; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
- struct page **pp = &_1ps->pages[c];
- bool uptodate;
-
- if (*pp) {
- if (ios->offset % PAGE_SIZE)
- /* Read the remainder of the page */
- _add_to_r4w_first_page(ios, *pp);
- /* to-be-written pages start here */
- goto read_last_stripe;
- }
-
- *pp = ios->r4w->get_page(ios->private, offset,
- &uptodate);
- if (unlikely(!*pp))
- return -ENOMEM;
-
- if (!uptodate)
- _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
-
- /* Mark read-pages to be cache_released */
- _1ps->page_is_read[c] = true;
- read_si.obj_offset += PAGE_SIZE;
- offset += PAGE_SIZE;
- }
- offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
- }
-
-read_last_stripe:
- return 0;
-}
-
-static int _read_4_write_last_stripe(struct ore_io_state *ios)
-{
- struct ore_striping_info read_si;
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- u64 offset;
- u64 last_stripe_end;
- unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
- unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
- offset = ios->offset + ios->length;
- if (offset % PAGE_SIZE)
- _add_to_r4w_last_page(ios, &offset);
- /* offset will be aligned to next page */
-
- last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
- * bytes_in_stripe;
- if (offset == last_stripe_end) /* Optimize for the aligned case */
- goto read_it;
-
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- p = read_si.cur_pg;
- c = read_si.cur_comp;
-
- if (min_p == sp2d->pages_in_unit) {
- /* Didn't do it yet */
- min_p = _sp2d_min_pg(sp2d);
- max_p = _sp2d_max_pg(sp2d);
- }
-
- ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
- offset, last_stripe_end, min_p, max_p);
-
- while (offset < last_stripe_end) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if ((min_p <= p) && (p <= max_p)) {
- struct page *page;
- bool uptodate;
-
- BUG_ON(_1ps->pages[c]);
- page = ios->r4w->get_page(ios->private, offset,
- &uptodate);
- if (unlikely(!page))
- return -ENOMEM;
-
- _1ps->pages[c] = page;
- /* Mark read-pages to be cache_released */
- _1ps->page_is_read[c] = true;
- if (!uptodate)
- _add_to_r4w(ios, &read_si, page, PAGE_SIZE);
- }
-
- offset += PAGE_SIZE;
- if (p == (sp2d->pages_in_unit - 1)) {
- ++c;
- p = 0;
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- } else {
- read_si.obj_offset += PAGE_SIZE;
- ++p;
- }
- }
-
-read_it:
- return 0;
-}
-
-static int _read_4_write_execute(struct ore_io_state *ios)
-{
- struct ore_io_state *ios_read;
- unsigned i;
- int ret;
-
- ios_read = ios->ios_read_4_write;
- if (!ios_read)
- return 0;
-
- /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
- * to check for per_dev->bio
- */
- ios_read->pages = ios->pages;
-
- /* Now read these devices */
- for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
- ret = _ore_read_mirror(ios_read, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios_read); /* Synchronus execution */
- if (unlikely(ret)) {
- ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
- return ret;
- }
-
- _mark_read4write_pages_uptodate(ios_read, ret);
- ore_put_io_state(ios_read);
- ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
- return 0;
-}
-
-/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
-int _ore_add_parity_unit(struct ore_io_state *ios,
- struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev,
- unsigned cur_len, bool do_xor)
-{
- if (ios->reading) {
- if (per_dev->cur_sg >= ios->sgs_per_dev) {
- ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
- per_dev->cur_sg, ios->sgs_per_dev);
- return -ENOMEM;
- }
- _ore_add_sg_seg(per_dev, cur_len, true);
- } else {
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- struct page **pages = ios->parity_pages + ios->cur_par_page;
- unsigned num_pages;
- unsigned array_start = 0;
- unsigned i;
- int ret;
-
- si->cur_pg = _sp2d_min_pg(sp2d);
- num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
-
- if (!per_dev->length) {
- per_dev->offset += si->cur_pg * PAGE_SIZE;
- /* If first stripe, Read in all read4write pages
- * (if needed) before we calculate the first parity.
- */
- if (do_xor)
- _read_4_write_first_stripe(ios);
- }
- if (!cur_len && do_xor)
- /* If last stripe r4w pages of last stripe */
- _read_4_write_last_stripe(ios);
- _read_4_write_execute(ios);
-
- for (i = 0; i < num_pages; i++) {
- pages[i] = _raid_page_alloc();
- if (unlikely(!pages[i]))
- return -ENOMEM;
-
- ++(ios->cur_par_page);
- }
-
- BUG_ON(si->cur_comp < sp2d->data_devs);
- BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
-
- ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
- per_dev, num_pages * PAGE_SIZE);
- if (unlikely(ret))
- return ret;
-
- if (do_xor) {
- _gen_xor_unit(sp2d);
- _sp2d_reset(sp2d, ios->r4w, ios->private);
- }
- }
- return 0;
-}
-
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
-{
- if (ios->parity_pages) {
- struct ore_layout *layout = ios->layout;
- unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
-
- if (_sp2d_alloc(pages_in_unit, layout->group_width,
- layout->parity, &ios->sp2d)) {
- return -ENOMEM;
- }
- }
- return 0;
-}
-
-void _ore_free_raid_stuff(struct ore_io_state *ios)
-{
- if (ios->sp2d) { /* writing and raid */
- unsigned i;
-
- for (i = 0; i < ios->cur_par_page; i++) {
- struct page *page = ios->parity_pages[i];
-
- if (page)
- _raid_page_free(page);
- }
- if (ios->extra_part_alloc)
- kfree(ios->parity_pages);
- /* If IO returned an error pages might need unlocking */
- _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
- _sp2d_free(ios->sp2d);
- } else {
- /* Will only be set if raid reading && sglist is big */
- if (ios->extra_part_alloc)
- kfree(ios->per_dev[0].sglist);
- }
- if (ios->ios_read_4_write)
- ore_put_io_state(ios->ios_read_4_write);
-}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
deleted file mode 100644
index a6e746775570..000000000000
--- a/fs/exofs/ore_raid.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) from 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- * "Free Software Foundation <info@fsf.org>"
- */
-
-#include <scsi/osd_ore.h>
-
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
-
-/* ios_raid.c stuff needed by ios.c */
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
-void _ore_free_raid_stuff(struct ore_io_state *ios);
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool not_last);
-int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool do_xor);
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page);
-static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page)
-{
- if (!sp2d) /* Inline the fast path */
- return; /* Hay no raid stuff */
- _ore_add_stripe_page(sp2d, si, page);
-}
-
-/* ios.c stuff needed by ios_raid.c */
-int _ore_get_io_state(struct ore_layout *layout,
- struct ore_components *oc, unsigned numdevs,
- unsigned sgs_per_dev, unsigned num_par_pages,
- struct ore_io_state **pios);
-int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct page **pages,
- struct ore_per_dev_state *per_dev, int cur_len);
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
-int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
deleted file mode 100644
index fc80c7233fa5..000000000000
--- a/fs/exofs/super.c
+++ /dev/null
@@ -1,1071 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/string.h>
-#include <linux/parser.h>
-#include <linux/vfs.h>
-#include <linux/random.h>
-#include <linux/module.h>
-#include <linux/exportfs.h>
-#include <linux/slab.h>
-#include <linux/iversion.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-/******************************************************************************
- * MOUNT OPTIONS
- *****************************************************************************/
-
-/*
- * struct to hold what we get from mount options
- */
-struct exofs_mountopt {
- bool is_osdname;
- const char *dev_name;
- uint64_t pid;
- int timeout;
-};
-
-/*
- * exofs-specific mount-time options.
- */
-enum { Opt_name, Opt_pid, Opt_to, Opt_err };
-
-/*
- * Our mount-time options. These should ideally be 64-bit unsigned, but the
- * kernel's parsing functions do not currently support that. 32-bit should be
- * sufficient for most applications now.
- */
-static match_table_t tokens = {
- {Opt_name, "osdname=%s"},
- {Opt_pid, "pid=%u"},
- {Opt_to, "to=%u"},
- {Opt_err, NULL}
-};
-
-/*
- * The main option parsing method. Also makes sure that all of the mandatory
- * mount options were set.
- */
-static int parse_options(char *options, struct exofs_mountopt *opts)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- bool s_pid = false;
-
- EXOFS_DBGMSG("parse_options %s\n", options);
- /* defaults */
- memset(opts, 0, sizeof(*opts));
- opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- char str[32];
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_name:
- kfree(opts->dev_name);
- opts->dev_name = match_strdup(&args[0]);
- if (unlikely(!opts->dev_name)) {
- EXOFS_ERR("Error allocating dev_name");
- return -ENOMEM;
- }
- opts->is_osdname = true;
- break;
- case Opt_pid:
- if (0 == match_strlcpy(str, &args[0], sizeof(str)))
- return -EINVAL;
- opts->pid = simple_strtoull(str, NULL, 0);
- if (opts->pid < EXOFS_MIN_PID) {
- EXOFS_ERR("Partition ID must be >= %u",
- EXOFS_MIN_PID);
- return -EINVAL;
- }
- s_pid = true;
- break;
- case Opt_to:
- if (match_int(&args[0], &option))
- return -EINVAL;
- if (option <= 0) {
- EXOFS_ERR("Timeout must be > 0");
- return -EINVAL;
- }
- opts->timeout = option * HZ;
- break;
- }
- }
-
- if (!s_pid) {
- EXOFS_ERR("Need to specify the following options:\n");
- EXOFS_ERR(" -o pid=pid_no_to_use\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-/******************************************************************************
- * INODE CACHE
- *****************************************************************************/
-
-/*
- * Our inode cache. Isn't it pretty?
- */
-static struct kmem_cache *exofs_inode_cachep;
-
-/*
- * Allocate an inode in the cache
- */
-static struct inode *exofs_alloc_inode(struct super_block *sb)
-{
- struct exofs_i_info *oi;
-
- oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
- if (!oi)
- return NULL;
-
- inode_set_iversion(&oi->vfs_inode, 1);
- return &oi->vfs_inode;
-}
-
-static void exofs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
-}
-
-/*
- * Remove an inode from the cache
- */
-static void exofs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, exofs_i_callback);
-}
-
-/*
- * Initialize the inode
- */
-static void exofs_init_once(void *foo)
-{
- struct exofs_i_info *oi = foo;
-
- inode_init_once(&oi->vfs_inode);
-}
-
-/*
- * Create and initialize the inode cache
- */
-static int init_inodecache(void)
-{
- exofs_inode_cachep = kmem_cache_create_usercopy("exofs_inode_cache",
- sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
- SLAB_ACCOUNT,
- offsetof(struct exofs_i_info, i_data),
- sizeof_field(struct exofs_i_info, i_data),
- exofs_init_once);
- if (exofs_inode_cachep == NULL)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Destroy the inode cache
- */
-static void destroy_inodecache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(exofs_inode_cachep);
-}
-
-/******************************************************************************
- * Some osd helpers
- *****************************************************************************/
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
- osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
- u64 offset, void *p, unsigned length)
-{
- struct osd_request *or = osd_start_request(od);
-/* struct osd_sense_info osi = {.key = 0};*/
- int ret;
-
- if (unlikely(!or)) {
- EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
- return -ENOMEM;
- }
- ret = osd_req_read_kern(or, obj, offset, p, length);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
- goto out;
- }
-
- ret = osd_finalize_request(or, 0, cred, NULL);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
- goto out;
- }
-
- ret = osd_execute_request(or);
- if (unlikely(ret))
- EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
- /* osd_req_decode_sense(or, ret); */
-
-out:
- osd_end_request(or);
- EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%p ret=>%d\n",
- _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
- return ret;
-}
-
-static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
- EXOFS_APAGE_SB_DATA,
- EXOFS_ATTR_SB_STATS,
- sizeof(struct exofs_sb_stats));
-
-static int __sbi_read_stats(struct exofs_sb_info *sbi)
-{
- struct osd_attr attrs[] = {
- [0] = g_attr_sb_stats,
- };
- struct ore_io_state *ios;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("Error reading super_block stats => %d\n", ret);
- goto out;
- }
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
- goto out;
- }
- if (attrs[0].len) {
- struct exofs_sb_stats *ess;
-
- if (unlikely(attrs[0].len != sizeof(*ess))) {
- EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
- "size(%d) != expected(%zd)\n",
- __func__, attrs[0].len, sizeof(*ess));
- goto out;
- }
-
- ess = attrs[0].val_ptr;
- sbi->s_nextid = le64_to_cpu(ess->s_nextid);
- sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
- }
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static void stats_done(struct ore_io_state *ios, void *p)
-{
- ore_put_io_state(ios);
- /* Good thanks nothing to do anymore */
-}
-
-/* Asynchronously write the stats attribute */
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
-{
- struct osd_attr attrs[] = {
- [0] = g_attr_sb_stats,
- };
- struct ore_io_state *ios;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
- sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
- attrs[0].val_ptr = &sbi->s_ess;
-
-
- ios->done = stats_done;
- ios->private = sbi;
- ios->out_attr = attrs;
- ios->out_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_write(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_write failed.\n", __func__);
- ore_put_io_state(ios);
- }
-
- return ret;
-}
-
-/******************************************************************************
- * SUPERBLOCK FUNCTIONS
- *****************************************************************************/
-static const struct super_operations exofs_sops;
-static const struct export_operations exofs_export_ops;
-
-/*
- * Write the superblock to the OSD
- */
-static int exofs_sync_fs(struct super_block *sb, int wait)
-{
- struct exofs_sb_info *sbi;
- struct exofs_fscb *fscb;
- struct ore_comp one_comp;
- struct ore_components oc;
- struct ore_io_state *ios;
- int ret = -ENOMEM;
-
- fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
- if (unlikely(!fscb))
- return -ENOMEM;
-
- sbi = sb->s_fs_info;
-
- /* NOTE: We no longer dirty the super_block anywhere in exofs. The
- * reason we write the fscb here on unmount is so we can stay backwards
- * compatible with fscb->s_version == 1. (What we are not compatible
- * with is if a new version FS crashed and then we try to mount an old
- * version). Otherwise the exofs_fscb is read-only from mkfs time. All
- * the writeable info is set in exofs_sbi_write_stats() above.
- */
-
- exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
-
- ret = ore_get_io_state(&sbi->layout, &oc, &ios);
- if (unlikely(ret))
- goto out;
-
- ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
- memset(fscb, 0, ios->length);
- fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
- fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
- fscb->s_magic = cpu_to_le16(sb->s_magic);
- fscb->s_newfs = 0;
- fscb->s_version = EXOFS_FSCB_VER;
-
- ios->offset = 0;
- ios->kern_buff = fscb;
-
- ret = ore_write(ios);
- if (unlikely(ret))
- EXOFS_ERR("%s: ore_write failed.\n", __func__);
-
-out:
- EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
- ore_put_io_state(ios);
- kfree(fscb);
- return ret;
-}
-
-static void _exofs_print_device(const char *msg, const char *dev_path,
- struct osd_dev *od, u64 pid)
-{
- const struct osd_dev_info *odi = osduld_device_info(od);
-
- printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
- msg, dev_path ?: "", odi->osdname, _LLU(pid));
-}
-
-static void exofs_free_sbi(struct exofs_sb_info *sbi)
-{
- unsigned numdevs = sbi->oc.numdevs;
-
- while (numdevs) {
- unsigned i = --numdevs;
- struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
-
- if (od) {
- ore_comp_set_dev(&sbi->oc, i, NULL);
- osduld_put_device(od);
- }
- }
- kfree(sbi->oc.ods);
- kfree(sbi);
-}
-
-/*
- * This function is called when the vfs is freeing the superblock. We just
- * need to free our own part.
- */
-static void exofs_put_super(struct super_block *sb)
-{
- int num_pend;
- struct exofs_sb_info *sbi = sb->s_fs_info;
-
- /* make sure there are no pending commands */
- for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
- num_pend = atomic_read(&sbi->s_curr_pending)) {
- wait_queue_head_t wq;
-
- printk(KERN_NOTICE "%s: !!Pending operations in flight. "
- "This is a BUG. please report to osd-dev@open-osd.org\n",
- __func__);
- init_waitqueue_head(&wq);
- wait_event_timeout(wq,
- (atomic_read(&sbi->s_curr_pending) == 0),
- msecs_to_jiffies(100));
- }
-
- _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
- sbi->one_comp.obj.partition);
-
- exofs_sysfs_sb_del(sbi);
- exofs_free_sbi(sbi);
- sb->s_fs_info = NULL;
-}
-
-static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
- struct exofs_device_table *dt)
-{
- int ret;
-
- sbi->layout.stripe_unit =
- le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->layout.group_width =
- le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->layout.group_depth =
- le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->layout.mirrors_p1 =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
- sbi->layout.raid_algorithm =
- le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
-
- ret = ore_verify_layout(numdevs, &sbi->layout);
-
- EXOFS_DBGMSG("exofs: layout: "
- "num_comps=%u stripe_unit=0x%x group_width=%u "
- "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
- numdevs,
- sbi->layout.stripe_unit,
- sbi->layout.group_width,
- _LLU(sbi->layout.group_depth),
- sbi->layout.mirrors_p1,
- sbi->layout.raid_algorithm);
- return ret;
-}
-
-static unsigned __ra_pages(struct ore_layout *layout)
-{
- const unsigned _MIN_RA = 32; /* min 128K read-ahead */
- unsigned ra_pages = layout->group_width * layout->stripe_unit /
- PAGE_SIZE;
- unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
-
- ra_pages *= 2; /* two stripes */
- if (ra_pages < _MIN_RA)
- ra_pages = roundup(_MIN_RA, ra_pages / 2);
-
- if (ra_pages > max_io_pages)
- ra_pages = max_io_pages;
-
- return ra_pages;
-}
-
-/* @odi is valid only as long as @fscb_dev is valid */
-static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
- struct osd_dev_info *odi)
-{
- odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
- if (likely(odi->systemid_len))
- memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
-
- odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
- odi->osdname = dt_dev->osdname;
-
- /* FIXME support long names. Will need a _put function */
- if (dt_dev->long_name_offset)
- return -EINVAL;
-
- /* Make sure osdname is printable!
- * mkexofs should give us space for a null-terminator else the
- * device-table is invalid.
- */
- if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
- odi->osdname_len = sizeof(dt_dev->osdname) - 1;
- dt_dev->osdname[odi->osdname_len] = 0;
-
- /* If it's all zeros something is bad we read past end-of-obj */
- return !(odi->systemid_len || odi->osdname_len);
-}
-
-static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
- struct exofs_dev **peds)
-{
- /* Twice bigger table: See exofs_init_comps() and comment at
- * exofs_read_lookup_dev_table()
- */
- const size_t numores = numdevs * 2 - 1;
- struct exofs_dev *eds;
- unsigned i;
-
- sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) +
- numdevs * sizeof(struct exofs_dev), GFP_KERNEL);
- if (unlikely(!sbi->oc.ods)) {
- EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
- numdevs);
- return -ENOMEM;
- }
-
- /* Start of allocated struct exofs_dev entries */
- *peds = eds = (void *)sbi->oc.ods[numores];
- /* Initialize pointers into struct exofs_dev */
- for (i = 0; i < numdevs; ++i)
- sbi->oc.ods[i] = &eds[i].ored;
- return 0;
-}
-
-static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
- struct osd_dev *fscb_od,
- unsigned table_count)
-{
- struct ore_comp comp;
- struct exofs_device_table *dt;
- struct exofs_dev *eds;
- unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
- sizeof(*dt);
- unsigned numdevs, i;
- int ret;
-
- dt = kmalloc(table_bytes, GFP_KERNEL);
- if (unlikely(!dt)) {
- EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
- table_bytes);
- return -ENOMEM;
- }
-
- sbi->oc.numdevs = 0;
-
- comp.obj.partition = sbi->one_comp.obj.partition;
- comp.obj.id = EXOFS_DEVTABLE_ID;
- exofs_make_credential(comp.cred, &comp.obj);
-
- ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
- table_bytes);
- if (unlikely(ret)) {
- EXOFS_ERR("ERROR: reading device table\n");
- goto out;
- }
-
- numdevs = le64_to_cpu(dt->dt_num_devices);
- if (unlikely(!numdevs)) {
- ret = -EINVAL;
- goto out;
- }
- WARN_ON(table_count != numdevs);
-
- ret = _read_and_match_data_map(sbi, numdevs, dt);
- if (unlikely(ret))
- goto out;
-
- ret = __alloc_dev_table(sbi, numdevs, &eds);
- if (unlikely(ret))
- goto out;
- /* exofs round-robins the device table view according to inode
- * number. We hold a: twice bigger table hence inodes can point
- * to any device and have a sequential view of the table
- * starting at this device. See exofs_init_comps()
- */
- memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
- (numdevs - 1) * sizeof(sbi->oc.ods[0]));
-
- /* create sysfs subdir under which we put the device table
- * And cluster layout. A Superblock is identified by the string:
- * "dev[0].osdname"_"pid"
- */
- exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
-
- for (i = 0; i < numdevs; i++) {
- struct exofs_fscb fscb;
- struct osd_dev_info odi;
- struct osd_dev *od;
-
- if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
- EXOFS_ERR("ERROR: Read all-zeros device entry\n");
- ret = -EINVAL;
- goto out;
- }
-
- printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
- i, odi.osdname);
-
- /* the exofs id is currently the table index */
- eds[i].did = i;
-
- /* On all devices the device table is identical. The user can
- * specify any one of the participating devices on the command
- * line. We always keep them in device-table order.
- */
- if (fscb_od && osduld_device_same(fscb_od, &odi)) {
- eds[i].ored.od = fscb_od;
- ++sbi->oc.numdevs;
- fscb_od = NULL;
- exofs_sysfs_odev_add(&eds[i], sbi);
- continue;
- }
-
- od = osduld_info_lookup(&odi);
- if (IS_ERR(od)) {
- ret = PTR_ERR(od);
- EXOFS_ERR("ERROR: device requested is not found "
- "osd_name-%s =>%d\n", odi.osdname, ret);
- goto out;
- }
-
- eds[i].ored.od = od;
- ++sbi->oc.numdevs;
-
- /* Read the fscb of the other devices to make sure the FS
- * partition is there.
- */
- ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
- sizeof(fscb));
- if (unlikely(ret)) {
- EXOFS_ERR("ERROR: Malformed participating device "
- "error reading fscb osd_name-%s\n",
- odi.osdname);
- goto out;
- }
- exofs_sysfs_odev_add(&eds[i], sbi);
-
- /* TODO: verify other information is correct and FS-uuid
- * matches. Benny what did you say about device table
- * generation and old devices?
- */
- }
-
-out:
- kfree(dt);
- if (unlikely(fscb_od && !ret)) {
- EXOFS_ERR("ERROR: Bad device-table container device not present\n");
- osduld_put_device(fscb_od);
- return -EINVAL;
- }
- return ret;
-}
-
-/*
- * Read the superblock from the OSD and fill in the fields
- */
-static int exofs_fill_super(struct super_block *sb,
- struct exofs_mountopt *opts,
- struct exofs_sb_info *sbi,
- int silent)
-{
- struct inode *root;
- struct osd_dev *od; /* Master device */
- struct exofs_fscb fscb; /*on-disk superblock info */
- struct ore_comp comp;
- unsigned table_count;
- int ret;
-
- /* use mount options to fill superblock */
- if (opts->is_osdname) {
- struct osd_dev_info odi = {.systemid_len = 0};
-
- odi.osdname_len = strlen(opts->dev_name);
- odi.osdname = (u8 *)opts->dev_name;
- od = osduld_info_lookup(&odi);
- kfree(opts->dev_name);
- opts->dev_name = NULL;
- } else {
- od = osduld_path_lookup(opts->dev_name);
- }
- if (IS_ERR(od)) {
- ret = -EINVAL;
- goto free_sbi;
- }
-
- /* Default layout in case we do not have a device-table */
- sbi->layout.stripe_unit = PAGE_SIZE;
- sbi->layout.mirrors_p1 = 1;
- sbi->layout.group_width = 1;
- sbi->layout.group_depth = -1;
- sbi->layout.group_count = 1;
- sbi->s_timeout = opts->timeout;
-
- sbi->one_comp.obj.partition = opts->pid;
- sbi->one_comp.obj.id = 0;
- exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
- sbi->oc.single_comp = EC_SINGLE_COMP;
- sbi->oc.comps = &sbi->one_comp;
-
- /* fill in some other data by hand */
- memset(sb->s_id, 0, sizeof(sb->s_id));
- strcpy(sb->s_id, "exofs");
- sb->s_blocksize = EXOFS_BLKSIZE;
- sb->s_blocksize_bits = EXOFS_BLKSHIFT;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_max_links = EXOFS_LINK_MAX;
- atomic_set(&sbi->s_curr_pending, 0);
- sb->s_bdev = NULL;
- sb->s_dev = 0;
-
- comp.obj.partition = sbi->one_comp.obj.partition;
- comp.obj.id = EXOFS_SUPER_ID;
- exofs_make_credential(comp.cred, &comp.obj);
-
- ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
- if (unlikely(ret))
- goto free_sbi;
-
- sb->s_magic = le16_to_cpu(fscb.s_magic);
- /* NOTE: we read below to be backward compatible with old versions */
- sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
- sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
-
- /* make sure what we read from the object store is correct */
- if (sb->s_magic != EXOFS_SUPER_MAGIC) {
- if (!silent)
- EXOFS_ERR("ERROR: Bad magic value\n");
- ret = -EINVAL;
- goto free_sbi;
- }
- if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
- EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
- EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
- ret = -EINVAL;
- goto free_sbi;
- }
-
- /* start generation numbers from a random point */
- get_random_bytes(&sbi->s_next_generation, sizeof(u32));
- spin_lock_init(&sbi->s_next_gen_lock);
-
- table_count = le64_to_cpu(fscb.s_dev_table_count);
- if (table_count) {
- ret = exofs_read_lookup_dev_table(sbi, od, table_count);
- if (unlikely(ret))
- goto free_sbi;
- } else {
- struct exofs_dev *eds;
-
- ret = __alloc_dev_table(sbi, 1, &eds);
- if (unlikely(ret))
- goto free_sbi;
-
- ore_comp_set_dev(&sbi->oc, 0, od);
- sbi->oc.numdevs = 1;
- }
-
- __sbi_read_stats(sbi);
-
- /* set up operation vectors */
- ret = super_setup_bdi(sb);
- if (ret) {
- EXOFS_DBGMSG("Failed to super_setup_bdi\n");
- goto free_sbi;
- }
- sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
- sb->s_fs_info = sbi;
- sb->s_op = &exofs_sops;
- sb->s_export_op = &exofs_export_ops;
- root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
- if (IS_ERR(root)) {
- EXOFS_ERR("ERROR: exofs_iget failed\n");
- ret = PTR_ERR(root);
- goto free_sbi;
- }
- sb->s_root = d_make_root(root);
- if (!sb->s_root) {
- EXOFS_ERR("ERROR: get root inode failed\n");
- ret = -ENOMEM;
- goto free_sbi;
- }
-
- if (!S_ISDIR(root->i_mode)) {
- dput(sb->s_root);
- sb->s_root = NULL;
- EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
- root->i_mode);
- ret = -EINVAL;
- goto free_sbi;
- }
-
- exofs_sysfs_dbg_print();
- _exofs_print_device("Mounting", opts->dev_name,
- ore_comp_dev(&sbi->oc, 0),
- sbi->one_comp.obj.partition);
- return 0;
-
-free_sbi:
- EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
- opts->dev_name, sbi->one_comp.obj.partition, ret);
- exofs_free_sbi(sbi);
- return ret;
-}
-
-/*
- * Set up the superblock (calls exofs_fill_super eventually)
- */
-static struct dentry *exofs_mount(struct file_system_type *type,
- int flags, const char *dev_name,
- void *data)
-{
- struct super_block *s;
- struct exofs_mountopt opts;
- struct exofs_sb_info *sbi;
- int ret;
-
- ret = parse_options(data, &opts);
- if (ret) {
- kfree(opts.dev_name);
- return ERR_PTR(ret);
- }
-
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi) {
- kfree(opts.dev_name);
- return ERR_PTR(-ENOMEM);
- }
-
- s = sget(type, NULL, set_anon_super, flags, NULL);
-
- if (IS_ERR(s)) {
- kfree(opts.dev_name);
- kfree(sbi);
- return ERR_CAST(s);
- }
-
- if (!opts.dev_name)
- opts.dev_name = dev_name;
-
-
- ret = exofs_fill_super(s, &opts, sbi, flags & SB_SILENT ? 1 : 0);
- if (ret) {
- deactivate_locked_super(s);
- return ERR_PTR(ret);
- }
- s->s_flags |= SB_ACTIVE;
- return dget(s->s_root);
-}
-
-/*
- * Return information about the file system state in the buffer. This is used
- * by the 'df' command, for example.
- */
-static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- struct osd_attr attrs[] = {
- ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
- OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
- ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
- OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
- };
- uint64_t capacity = ULLONG_MAX;
- uint64_t used = ULLONG_MAX;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (ret) {
- EXOFS_DBGMSG("ore_get_io_state failed.\n");
- return ret;
- }
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret))
- goto out;
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (likely(!ret)) {
- capacity = get_unaligned_be64(attrs[0].val_ptr);
- if (unlikely(!capacity))
- capacity = ULLONG_MAX;
- } else
- EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
-
- ret = extract_attr_from_ios(ios, &attrs[1]);
- if (likely(!ret))
- used = get_unaligned_be64(attrs[1].val_ptr);
- else
- EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
-
- /* fill in the stats buffer */
- buf->f_type = EXOFS_SUPER_MAGIC;
- buf->f_bsize = EXOFS_BLKSIZE;
- buf->f_blocks = capacity >> 9;
- buf->f_bfree = (capacity - used) >> 9;
- buf->f_bavail = buf->f_bfree;
- buf->f_files = sbi->s_numfiles;
- buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
- buf->f_namelen = EXOFS_NAME_LEN;
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static const struct super_operations exofs_sops = {
- .alloc_inode = exofs_alloc_inode,
- .destroy_inode = exofs_destroy_inode,
- .write_inode = exofs_write_inode,
- .evict_inode = exofs_evict_inode,
- .put_super = exofs_put_super,
- .sync_fs = exofs_sync_fs,
- .statfs = exofs_statfs,
-};
-
-/******************************************************************************
- * EXPORT OPERATIONS
- *****************************************************************************/
-
-static struct dentry *exofs_get_parent(struct dentry *child)
-{
- unsigned long ino = exofs_parent_ino(child);
-
- if (!ino)
- return ERR_PTR(-ESTALE);
-
- return d_obtain_alias(exofs_iget(child->d_sb, ino));
-}
-
-static struct inode *exofs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
-{
- struct inode *inode;
-
- inode = exofs_iget(sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- if (generation && inode->i_generation != generation) {
- /* we didn't find the right inode.. */
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
- return inode;
-}
-
-static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
- struct fid *fid, int fh_len, int fh_type)
-{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- exofs_nfs_get_inode);
-}
-
-static struct dentry *exofs_fh_to_parent(struct super_block *sb,
- struct fid *fid, int fh_len, int fh_type)
-{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- exofs_nfs_get_inode);
-}
-
-static const struct export_operations exofs_export_ops = {
- .fh_to_dentry = exofs_fh_to_dentry,
- .fh_to_parent = exofs_fh_to_parent,
- .get_parent = exofs_get_parent,
-};
-
-/******************************************************************************
- * INSMOD/RMMOD
- *****************************************************************************/
-
-/*
- * struct that describes this file system
- */
-static struct file_system_type exofs_type = {
- .owner = THIS_MODULE,
- .name = "exofs",
- .mount = exofs_mount,
- .kill_sb = generic_shutdown_super,
-};
-MODULE_ALIAS_FS("exofs");
-
-static int __init init_exofs(void)
-{
- int err;
-
- err = init_inodecache();
- if (err)
- goto out;
-
- err = register_filesystem(&exofs_type);
- if (err)
- goto out_d;
-
- /* We don't fail if sysfs creation failed */
- exofs_sysfs_init();
-
- return 0;
-out_d:
- destroy_inodecache();
-out:
- return err;
-}
-
-static void __exit exit_exofs(void)
-{
- exofs_sysfs_uninit();
- unregister_filesystem(&exofs_type);
- destroy_inodecache();
-}
-
-MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
-MODULE_DESCRIPTION("exofs");
-MODULE_LICENSE("GPL");
-
-module_init(init_exofs)
-module_exit(exit_exofs)
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
deleted file mode 100644
index 1f7d5e46cdda..000000000000
--- a/fs/exofs/sys.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (C) 2012
- * Sachin Bhamare <sbhamare@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License 2 as published by
- * the Free Software Foundation.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the:
- * Free Software Foundation <licensing@fsf.org>
- */
-
-#include <linux/kobject.h>
-#include <linux/device.h>
-
-#include "exofs.h"
-
-struct odev_attr {
- struct attribute attr;
- ssize_t (*show)(struct exofs_dev *, char *);
- ssize_t (*store)(struct exofs_dev *, const char *, size_t);
-};
-
-static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
- char *buf)
-{
- struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
- struct odev_attr *a = container_of(attr, struct odev_attr, attr);
-
- return a->show ? a->show(edp, buf) : 0;
-}
-
-static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t len)
-{
- struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
- struct odev_attr *a = container_of(attr, struct odev_attr, attr);
-
- return a->store ? a->store(edp, buf, len) : len;
-}
-
-static const struct sysfs_ops odev_attr_ops = {
- .show = odev_attr_show,
- .store = odev_attr_store,
-};
-
-
-static struct kset *exofs_kset;
-
-static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
-{
- struct osd_dev *odev = edp->ored.od;
- const struct osd_dev_info *odi = osduld_device_info(odev);
-
- return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
-}
-
-static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
-{
- struct osd_dev *odev = edp->ored.od;
- const struct osd_dev_info *odi = osduld_device_info(odev);
-
- memcpy(buf, odi->systemid, odi->systemid_len);
- return odi->systemid_len;
-}
-
-static ssize_t uri_show(struct exofs_dev *edp, char *buf)
-{
- return snprintf(buf, edp->urilen, "%s", edp->uri);
-}
-
-static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
-{
- uint8_t *new_uri;
-
- edp->urilen = strlen(buf) + 1;
- new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
- if (new_uri == NULL)
- return -ENOMEM;
- edp->uri = new_uri;
- strncpy(edp->uri, buf, edp->urilen);
- return edp->urilen;
-}
-
-#define OSD_ATTR(name, mode, show, store) \
- static struct odev_attr odev_attr_##name = \
- __ATTR(name, mode, show, store)
-
-OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
-OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
-OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
-
-static struct attribute *odev_attrs[] = {
- &odev_attr_osdname.attr,
- &odev_attr_systemid.attr,
- &odev_attr_uri.attr,
- NULL,
-};
-
-static struct kobj_type odev_ktype = {
- .default_attrs = odev_attrs,
- .sysfs_ops = &odev_attr_ops,
-};
-
-static struct kobj_type uuid_ktype = {
-};
-
-void exofs_sysfs_dbg_print(void)
-{
-#ifdef CONFIG_EXOFS_DEBUG
- struct kobject *k_name, *k_tmp;
-
- list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
- printk(KERN_INFO "%s: name %s ref %d\n",
- __func__, kobject_name(k_name),
- (int)kref_read(&k_name->kref));
- }
-#endif
-}
-/*
- * This function removes all kobjects under exofs_kset
- * At the end of it, exofs_kset kobject will have a refcount
- * of 1 which gets decremented only on exofs module unload
- */
-void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
-{
- struct kobject *k_name, *k_tmp;
- struct kobject *s_kobj = &sbi->s_kobj;
-
- list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
- /* Remove all that are children of this SBI */
- if (k_name->parent == s_kobj)
- kobject_put(k_name);
- }
- kobject_put(s_kobj);
-}
-
-/*
- * This function creates sysfs entries to hold the current exofs cluster
- * instance (uniquely identified by osdname,pid tuple).
- * This function gets called once per exofs mount instance.
- */
-int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
- struct exofs_dt_device_info *dt_dev)
-{
- struct kobject *s_kobj;
- int retval = 0;
- uint64_t pid = sbi->one_comp.obj.partition;
-
- /* allocate new uuid dirent */
- s_kobj = &sbi->s_kobj;
- s_kobj->kset = exofs_kset;
- retval = kobject_init_and_add(s_kobj, &uuid_ktype,
- &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid);
- if (retval) {
- EXOFS_ERR("ERROR: Failed to create sysfs entry for "
- "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
- return -ENOMEM;
- }
- return 0;
-}
-
-int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
-{
- struct kobject *d_kobj;
- int retval = 0;
-
- /* create osd device group which contains following attributes
- * osdname, systemid & uri
- */
- d_kobj = &edev->ed_kobj;
- d_kobj->kset = exofs_kset;
- retval = kobject_init_and_add(d_kobj, &odev_ktype,
- &sbi->s_kobj, "dev%u", edev->did);
- if (retval) {
- EXOFS_ERR("ERROR: Failed to create sysfs entry for "
- "device dev%u\n", edev->did);
- return retval;
- }
- return 0;
-}
-
-int exofs_sysfs_init(void)
-{
- exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
- if (!exofs_kset) {
- EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-void exofs_sysfs_uninit(void)
-{
- kset_unregister(exofs_kset);
-}
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3b8114def693..13318e255ebf 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask)
return (char *)p - base;
}
-static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
- [EXT2_FT_UNKNOWN] = DT_UNKNOWN,
- [EXT2_FT_REG_FILE] = DT_REG,
- [EXT2_FT_DIR] = DT_DIR,
- [EXT2_FT_CHRDEV] = DT_CHR,
- [EXT2_FT_BLKDEV] = DT_BLK,
- [EXT2_FT_FIFO] = DT_FIFO,
- [EXT2_FT_SOCK] = DT_SOCK,
- [EXT2_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK,
-};
-
static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
{
- umode_t mode = inode->i_mode;
if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
else
de->file_type = 0;
}
@@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
- unsigned char *types = NULL;
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+ bool has_filetype;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
- if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- types = ext2_filetype_table;
+ has_filetype =
+ EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (de->inode) {
unsigned char d_type = DT_UNKNOWN;
- if (types && de->file_type < EXT2_FT_MAX)
- d_type = types[de->file_type];
+ if (has_filetype)
+ d_type = fs_ftype_to_dtype(de->file_type);
if (!dir_emit(ctx, de->name, de->name_len,
le32_to_cpu(de->inode),
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e770cd100a6a..10ab238de9a6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -604,22 +604,6 @@ struct ext2_dir_entry_2 {
};
/*
- * Ext2 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-enum {
- EXT2_FT_UNKNOWN = 0,
- EXT2_FT_REG_FILE = 1,
- EXT2_FT_DIR = 2,
- EXT2_FT_CHRDEV = 3,
- EXT2_FT_BLKDEV = 4,
- EXT2_FT_FIFO = 5,
- EXT2_FT_SOCK = 6,
- EXT2_FT_SYMLINK = 7,
- EXT2_FT_MAX
-};
-
-/*
* EXT2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
@@ -774,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_evict_inode(struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int ext2_setattr (struct dentry *, struct iattr *);
+extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
extern void ext2_set_inode_flags(struct inode *inode);
extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 28b2609f25c1..39c4772e96c9 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c3d7b7e4975..a0c5ea91fcd4 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
best_desc = desc;
}
}
- if (!best_desc)
- return -1;
return best_group;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e4bb9386c045..c27c27300d95 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode,
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
/*
- * Next look up the indirect map to count the totoal number of
+ * Next look up the indirect map to count the total number of
* direct blocks to allocate for this branch.
*/
count = ext2_blks_to_allocate(partial, indirect_blks,
@@ -1239,6 +1239,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 1);
}
+ /* fall through */
case EXT2_IND_BLOCK:
nr = i_data[EXT2_DIND_BLOCK];
if (nr) {
@@ -1246,6 +1247,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 2);
}
+ /* fall through */
case EXT2_DIND_BLOCK:
nr = i_data[EXT2_TIND_BLOCK];
if (nr) {
@@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
+int ext2_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_falgs)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ unsigned int flags;
+
+ flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
+ if (flags & EXT2_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (flags & EXT2_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (flags & EXT2_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (flags & EXT2_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_ENCRYPTED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 0c26dcc5d850..ccfbbf59e2fc 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
@@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 73b2d528237f..0128010a0874 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -757,7 +757,8 @@ static loff_t ext2_max_size(int bits)
{
loff_t res = EXT2_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
+ unsigned int upper_limit;
+ unsigned int ppb = 1 << (bits-2);
/* This is calculated to be the largest file size for a
* dense, file such that the total number of
@@ -771,24 +772,34 @@ static loff_t ext2_max_size(int bits)
/* total blocks in file system block size */
upper_limit >>= (bits - 9);
+ /* Compute how many blocks we can address by block tree */
+ res += 1LL << (bits-2);
+ res += 1LL << (2*(bits-2));
+ res += 1LL << (3*(bits-2));
+ /* Does block tree limit file size? */
+ if (res < upper_limit)
+ goto check_lfs;
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT2_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* tripple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
+ DIV_ROUND_UP(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
@@ -1024,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT2_INODE_SIZE(sb) == 0)
- goto cantfind_ext2;
sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
goto cantfind_ext2;
@@ -1087,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount;
}
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
if (!sbi->s_debts) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount_group_desc;
}
@@ -1148,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT2_FS_XATTR
sbi->s_ea_block_cache = ext2_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
goto failed_mount3;
}
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index d5589ddcc281..00cdb8679486 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -23,6 +23,7 @@
const struct inode_operations ext2_symlink_inode_operations = {
.get_link = page_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
@@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.get_link = simple_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 4f30876ee325..1e33e0ac8cf1 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
return;
spin_lock(&EXT2_SB(sb)->s_lock);
+ ext2_update_dynamic_rev(sb);
EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
spin_unlock(&EXT2_SB(sb)->s_lock);
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index a453cc87082b..06f77ca7f36e 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -96,23 +96,8 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
-config EXT4_ENCRYPTION
- bool "Ext4 Encryption"
- depends on EXT4_FS
- select FS_ENCRYPTION
- help
- Enable encryption of ext4 files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
-config EXT4_FS_ENCRYPTION
- bool
- default y
- depends on EXT4_ENCRYPTION
-
config EXT4_DEBUG
- bool "EXT4 debugging support"
+ bool "Ext4 debugging support"
depends on EXT4_FS
help
Enables run-time debugging support for the ext4 filesystem.
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f93f9881ec18..0ccd51f72048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,7 +111,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int dir_has_error = 0;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
return err;
@@ -138,7 +138,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
@@ -245,7 +245,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- if (!ext4_encrypted_inode(inode)) {
+ if (!IS_ENCRYPTED(inode)) {
if (!dir_emit(ctx, de->name,
de->name_len,
le32_to_cpu(de->inode),
@@ -283,9 +283,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
done:
err = 0;
errout:
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fstr);
-#endif
brelse(bh);
return err;
}
@@ -613,7 +611,7 @@ finished:
static int ext4_dir_open(struct inode * inode, struct file * filp)
{
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 185a05d3257e..82ffdacdc7fa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -40,7 +40,6 @@
#include <linux/compat.h>
#endif
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#include <linux/compiler.h>
@@ -426,6 +425,9 @@ struct flex_groups {
/* Flags that are appropriate for non-directories/regular files. */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+/* The only flags that should be swapped */
+#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -1326,7 +1328,7 @@ struct ext4_super_block {
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
EXT4_MF_TEST_DUMMY_ENCRYPTION))
#else
@@ -1662,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
@@ -1670,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_compat |= \
cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
@@ -1687,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
@@ -1704,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_incompat |= \
cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
@@ -2051,7 +2058,7 @@ struct ext4_filename {
const struct qstr *usr_fname;
struct fscrypt_str disk_name;
struct dx_hash_info hinfo;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
};
@@ -2279,12 +2286,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-static inline bool ext4_encrypted_inode(struct inode *inode)
-{
- return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
-}
-
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static inline int ext4_fname_setup_filename(struct inode *dir,
const struct qstr *iname,
int lookup, struct ext4_filename *fname)
@@ -2672,7 +2674,6 @@ do { \
#endif
-extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
extern int ext4_update_rocompat_feature(handle_t *handle,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 15b6dd733780..a1ac7e9245ec 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -411,7 +411,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
- if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 240b6dea5441..0f89f5190cd7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2956,14 +2956,17 @@ again:
if (err < 0)
goto out;
- } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+ } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
+ partial.state == initial) {
/*
- * If there's an extent to the right its first cluster
- * contains the immediate right boundary of the
- * truncated/punched region. Set partial_cluster to
- * its negative value so it won't be freed if shared
- * with the current extent. The end < ee_block case
- * is handled in ext4_ext_rm_leaf().
+ * If we're punching, there's an extent to the right.
+ * If the partial cluster hasn't been set, set it to
+ * that extent's first cluster and its state to nofree
+ * so it won't be freed should it contain blocks to be
+ * removed. If it's already set (tofree/nofree), we're
+ * retrying and keep the original partial cluster info
+ * so a cluster marked tofree as a result of earlier
+ * extent removal is not lost.
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
@@ -3631,7 +3634,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
max_zeroout = 0;
/*
@@ -4048,18 +4051,8 @@ out:
} else
allocated = ret;
map->m_flags |= EXT4_MAP_NEW;
- /*
- * if we allocated more blocks than requested
- * we need to make sure we unmap the extra block
- * allocated. The actual needed block will get
- * unmapped later when we find the buffer_head marked
- * new.
- */
- if (allocated > map->m_len) {
- clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
- allocated - map->m_len);
+ if (allocated > map->m_len)
allocated = map->m_len;
- }
map->m_len = allocated;
map_out:
@@ -4818,7 +4811,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
* leave it disabled for encrypted inodes for now. This is a
* bug we should fix....
*/
- if (ext4_encrypted_inode(inode) &&
+ if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
FALLOC_FL_ZERO_RANGE)))
return -EOPNOTSUPP;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e22dcfab308b..46b24da33a28 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_TEA:
p = name;
while (len > 0) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7ff14a1adba3..f3e17a8c84b4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -771,7 +771,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
!(i_flags & EXT4_EA_INODE_FL)) {
err = fscrypt_get_encryption_info(dir);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bf7fa1507e81..c2225f0d31b5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1183,18 +1183,21 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
@@ -1433,6 +1436,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
if (++n >= n2)
return 0;
@@ -1441,6 +1445,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
if (++n >= n2)
return 0;
@@ -1449,6 +1454,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 34d7e0703cc6..b54b261ded36 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
* inode's preallocations.
*/
if ((ei->i_reserved_data_blocks == 0) &&
- (atomic_read(&inode->i_writecount) == 0))
+ !inode_is_open_for_write(inode))
ext4_discard_preallocations(inode);
}
@@ -415,7 +415,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
{
int ret;
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -678,8 +678,6 @@ found:
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -1150,7 +1148,7 @@ int do_journal_get_write_access(handle_t *handle,
return ret;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
@@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
- clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -1217,8 +1214,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
(block_start < from || block_end > to)) {
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++ = bh;
- decrypt = ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode);
+ decrypt = IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}
}
/*
@@ -1303,7 +1299,7 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block_unwritten);
@@ -2490,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
}
BUG_ON(map->m_len == 0);
- if (map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
- }
return 0;
}
@@ -2836,12 +2828,12 @@ retry:
goto unplug;
}
ret = mpage_prepare_extent_to_map(&mpd);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, false);
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
ext4_put_io_end_defer(mpd.io_submit.io_end);
mpd.io_submit.io_end = NULL;
- /* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
if (ret < 0)
goto unplug;
@@ -2909,10 +2901,11 @@ retry:
handle = NULL;
mpd.do_map = 0;
}
- /* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
/* Unlock pages we didn't use */
mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+
/*
* Drop our io_end reference we got from init. We have
* to be careful and use deferred io_end finishing if
@@ -3105,7 +3098,7 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ret = ext4_block_write_begin(page, pos, len,
ext4_da_get_block_prep);
#else
@@ -3880,8 +3873,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t offset = iocb->ki_pos;
ssize_t ret;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+#ifdef CONFIG_FS_ENCRYPTION
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return 0;
#endif
@@ -4065,8 +4058,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
- if (S_ISREG(inode->i_mode) &&
- ext4_encrypted_inode(inode)) {
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_SIZE);
@@ -4142,7 +4134,7 @@ static int ext4_block_truncate_page(handle_t *handle,
struct inode *inode = mapping->host;
/* If we are processing an encrypted inode during orphan list handling */
- if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode))
+ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
blocksize = inode->i_sb->s_blocksize;
@@ -4722,7 +4714,7 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_has_inline_data(inode))
return false;
- if (ext4_encrypted_inode(inode))
+ if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
return false;
return true;
}
@@ -5072,7 +5064,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
@@ -5351,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle,
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (err)
goto out_brelse;
- ext4_update_dynamic_rev(sb);
ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
@@ -6002,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode,
ext4_write_lock_xattr(inode, &no_expand);
- BUFFER_TRACE(iloc.bh, "get_write_access");
+ BUFFER_TRACE(iloc->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, iloc->bh);
if (error) {
brelse(iloc->bh);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d37dafa1d133..3c4f8bb59f8a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
loff_t isize;
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
+ unsigned long tmp;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_blocks, inode2->i_blocks);
- swap(inode1->i_bytes, inode2->i_bytes);
swap(inode1->i_atime, inode2->i_atime);
swap(inode1->i_mtime, inode2->i_mtime);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
- swap(ei1->i_flags, ei2->i_flags);
+ tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+ ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+ (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+ ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
swap(ei1->i_disksize, ei2->i_disksize);
ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
@@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb,
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
-
- if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
- IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
- ext4_has_inline_data(inode))
- return -EINVAL;
-
- if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
- !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ qsize_t size, size_bl, diff;
+ blkcnt_t blocks;
+ unsigned short bytes;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
- filemap_flush(inode->i_mapping);
- filemap_flush(inode_bl->i_mapping);
-
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
lock_two_nondirectories(inode, inode_bl);
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+ IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+ (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ||
+ ext4_has_inline_data(inode)) {
+ err = -EINVAL;
+ goto journal_err_out;
+ }
+
+ if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+ !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto journal_err_out;
+ }
+
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto err_out;
+
+ err = filemap_write_and_wait(inode_bl->i_mapping);
+ if (err)
+ goto err_out;
+
/* Wait for all existing dio workers */
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
@@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
- goto journal_err_out;
+ goto err_out;
}
/* Protect extent tree against block allocations via delalloc */
@@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
}
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out1;
+
+ size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+ size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+ diff = size - size_bl;
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
@@ -183,34 +206,58 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
+ /* No need to update quota information. */
ext4_warning(inode->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
- } else {
- err = ext4_mark_inode_dirty(handle, inode_bl);
- if (err < 0) {
- ext4_warning(inode_bl->i_sb,
- "couldn't mark inode #%lu dirty (err %d)",
- inode_bl->i_ino, err);
- /* Revert all changes: */
- swap_inode_data(inode, inode_bl);
- ext4_mark_inode_dirty(handle, inode);
- ext4_mark_inode_dirty(handle, inode_bl);
- }
+ goto err_out1;
+ }
+
+ blocks = inode_bl->i_blocks;
+ bytes = inode_bl->i_bytes;
+ inode_bl->i_blocks = inode->i_blocks;
+ inode_bl->i_bytes = inode->i_bytes;
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ /* No need to update quota information. */
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ goto revert;
+ }
+
+ /* Bootloader inode should not be counted into quota information. */
+ if (diff > 0)
+ dquot_free_space(inode, diff);
+ else
+ err = dquot_alloc_space(inode, -1 * diff);
+
+ if (err < 0) {
+revert:
+ /* Revert all changes: */
+ inode_bl->i_blocks = blocks;
+ inode_bl->i_bytes = bytes;
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_mark_inode_dirty(handle, inode_bl);
}
+
+err_out1:
ext4_journal_stop(handle);
ext4_double_up_write_data_sem(inode, inode_bl);
+err_out:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
return err;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int uuid_is_zero(__u8 u[16])
{
int i;
@@ -978,7 +1025,7 @@ resizefs_out:
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
handle_t *handle;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2248083cdca..6fb76d408093 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
- if ((size == isize) &&
- !ext4_fs_is_busy(sbi) &&
- (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+ !inode_is_open_for_write(ac->ac_inode)) {
ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
@@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
- atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+ inode_is_open_for_write(ar->inode) ? "" : "non-");
return 0;
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2f5be02fc6f6..1083a9f3f16a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -592,8 +592,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
return -EOPNOTSUPP;
}
- if (ext4_encrypted_inode(orig_inode) ||
- ext4_encrypted_inode(donor_inode)) {
+ if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
ext4_msg(orig_inode->i_sb, KERN_ERR,
"Online defrag not supported for encrypted files");
return -EOPNOTSUPP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2b928eb07fa2..980166a8122a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -612,7 +612,7 @@ static struct stats dx_show_leaf(struct inode *dir,
{
if (show_names)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int len;
char *name;
struct fscrypt_str fname_crypto_str =
@@ -621,7 +621,7 @@ static struct stats dx_show_leaf(struct inode *dir,
name = de->name;
len = de->name_len;
- if (ext4_encrypted_inode(dir))
+ if (IS_ENCRYPTED(dir))
res = fscrypt_get_encryption_info(dir);
if (res) {
printk(KERN_WARNING "Error setting up"
@@ -984,9 +984,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
/* Check if the directory is encrypted */
- if (ext4_encrypted_inode(dir)) {
+ if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
if (err < 0) {
brelse(bh);
@@ -1015,7 +1015,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
continue;
if (de->inode == 0)
continue;
- if (!ext4_encrypted_inode(dir)) {
+ if (!IS_ENCRYPTED(dir)) {
tmp_str.name = de->name;
tmp_str.len = de->name_len;
err = ext4_htree_store_dirent(dir_file,
@@ -1047,7 +1047,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
}
errout:
brelse(bh);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fname_crypto_str);
#endif
return count;
@@ -1267,7 +1267,7 @@ static inline bool ext4_match(const struct ext4_filename *fname,
f.usr_fname = fname->usr_fname;
f.disk_name = fname->disk_name;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
f.crypto_buf = fname->crypto_buf;
#endif
return fscrypt_match_name(&f, de->name, de->name_len);
@@ -1498,7 +1498,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
ext4_lblk_t block;
int retval;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
*res_dir = NULL;
#endif
frame = dx_probe(fname, dir, NULL, frames);
@@ -1578,7 +1578,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
ino);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+ if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
ext4_warning(inode->i_sb,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 2aa62d58d8dd..3e9298e6a705 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -63,10 +63,11 @@ static void ext4_finish_bio(struct bio *bio)
{
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct page *data_page = NULL;
#endif
struct buffer_head *bh, *head;
@@ -78,7 +79,7 @@ static void ext4_finish_bio(struct bio *bio)
if (!page)
continue;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (!page->mapping) {
/* The bounce data pages are unmapped. */
data_page = page;
@@ -111,7 +112,7 @@ static void ext4_finish_bio(struct bio *bio)
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!under_io) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (data_page)
fscrypt_restore_control_page(data_page);
#endif
@@ -467,18 +468,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ext4_io_submit(io);
continue;
}
- if (buffer_new(bh)) {
+ if (buffer_new(bh))
clear_buffer_new(bh);
- clean_bdev_bh_alias(bh);
- }
set_buffer_async_write(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
bh = head = page_buffers(page);
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
- nr_to_submit) {
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
retry_encrypt:
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 6aa282ee455a..3adadf461825 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -49,7 +49,7 @@
static inline bool ext4_bio_encrypted(struct bio *bio)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
return unlikely(bio->bi_private != NULL);
#else
return false;
@@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
if (ext4_bio_encrypted(bio)) {
if (bio->bi_status) {
@@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio)
return;
}
}
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -242,8 +243,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (bio == NULL) {
struct fscrypt_ctx *ctx = NULL;
- if (ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 48421de803b7..3d9b18505c0c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1960,7 +1960,8 @@ retry:
le16_to_cpu(es->s_reserved_gdt_blocks);
n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
n_blocks_count = (ext4_fsblk_t)n_group *
- EXT4_BLOCKS_PER_GROUP(sb);
+ EXT4_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(es->s_first_data_block);
n_group--; /* set to last group number */
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb12d3c17c1b..f5b828bf1299 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1232,7 +1232,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return try_to_free_buffers(page);
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
@@ -1922,7 +1922,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
ext4_msg(sb, KERN_WARNING,
"Test dummy encryption mode enabled");
@@ -2249,7 +2249,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
- ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
ext4_set_feature_journal_needs_recovery(sb);
@@ -4167,7 +4166,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_QUOTA
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a026a1f1..616c075da062 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@ typedef enum {
attr_feature,
attr_pointer_ui,
attr_pointer_atomic,
+ attr_journal_task,
} attr_id_t;
typedef enum {
@@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
return count;
}
+static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
+{
+ if (!sbi->s_journal)
+ return snprintf(buf, PAGE_SIZE, "<none>\n");
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ task_pid_vnr(sbi->s_journal->j_task));
+}
+
#define EXT4_ATTR(_name,_mode,_id) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
+EXT4_ATTR(journal_task, 0444, journal_task);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(errors_count),
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
+ ATTR_LIST(journal_task),
NULL,
};
@@ -224,7 +235,7 @@ static struct attribute *ext4_attrs[] = {
EXT4_ATTR_FEATURE(lazy_itable_init);
EXT4_ATTR_FEATURE(batched_discard);
EXT4_ATTR_FEATURE(meta_bg_resize);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
@@ -233,7 +244,7 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
ATTR_LIST(metadata_csum_seed),
@@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return print_tstamp(buf, sbi->s_es, s_first_error_time);
case attr_last_error_time:
return print_tstamp(buf, sbi->s_es, s_last_error_time);
+ case attr_journal_task:
+ return journal_task_show(sbi, buf);
}
return 0;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86ed9c686249..dc82e7757f67 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
@@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
if (error == -EIO)
EXT4_ERROR_INODE(inode, "block %llu read error",
EXT4_I(inode)->i_file_acl);
+ bh = NULL;
goto cleanup;
}
error = ext4_xattr_check_block(inode, bh);
@@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode,
if (IS_ERR(bh)) {
if (PTR_ERR(bh) == -ENOMEM)
return NULL;
+ bh = NULL;
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long)ce->e_value);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 9a20ef42fadd..e57cc754d543 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -3,6 +3,7 @@ config F2FS_FS
depends on BLOCK
select CRYPTO
select CRYPTO_CRC32
+ select F2FS_FS_XATTR if FS_ENCRYPTION
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -70,17 +71,6 @@ config F2FS_CHECK_FS
If you want to improve the performance, say N.
-config F2FS_FS_ENCRYPTION
- bool "F2FS Encryption"
- depends on F2FS_FS
- depends on F2FS_FS_XATTR
- select FS_ENCRYPTION
- help
- Enable encryption of f2fs files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
config F2FS_IO_TRACE
bool "F2FS IO tracer"
depends on F2FS_FS
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f91d8630c9a2..568e1d09eb48 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio)
struct page *page;
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
page = bv->bv_page;
/* PG_error was set if any post_read step failed */
@@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio)
struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
f2fs_show_injection_info(FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
@@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
struct bio_vec *bvec;
struct page *target;
int i;
+ struct bvec_iter_all iter_all;
if (!io->bio)
return false;
@@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
if (!inode && !page && !ino)
return true;
- bio_for_each_segment_all(bvec, io->bio, i) {
+ bio_for_each_segment_all(bvec, io->bio, i, iter_all) {
if (bvec->bv_page->mapping)
target = bvec->bv_page;
@@ -1465,7 +1468,7 @@ next:
}
if (size) {
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
ret = fiemap_fill_next_extent(fieinfo, logical,
@@ -1736,7 +1739,7 @@ static inline bool check_inplace_update_policy(struct inode *inode,
if (policy & (0x1 << F2FS_IPU_ASYNC) &&
fio && fio->op == REQ_OP_WRITE &&
!(fio->op_flags & REQ_SYNC) &&
- !f2fs_encrypted_inode(inode))
+ !IS_ENCRYPTED(inode))
return true;
/* this is only set during fdatasync */
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index ebcc121920ba..fd7f170e2f2d 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -506,30 +506,16 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
kvfree(si);
}
-int __init f2fs_create_root_stats(void)
+void __init f2fs_create_root_stats(void)
{
- struct dentry *file;
-
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
- if (!f2fs_debugfs_root)
- return -ENOMEM;
- file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
- NULL, &stat_fops);
- if (!file) {
- debugfs_remove(f2fs_debugfs_root);
- f2fs_debugfs_root = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
+ &stat_fops);
}
void f2fs_destroy_root_stats(void)
{
- if (!f2fs_debugfs_root)
- return;
-
debugfs_remove_recursive(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 50d0d36280fa..713b36a10a79 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -385,7 +385,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if ((f2fs_encrypted_inode(dir) || dummy_encrypt) &&
+ if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
f2fs_may_encrypt(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
@@ -399,7 +399,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (new_name) {
init_dent_inode(new_name, page);
- if (f2fs_encrypted_inode(dir))
+ if (IS_ENCRYPTED(dir))
file_set_enc_name(inode);
}
@@ -819,7 +819,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
goto out;
}
- if (f2fs_encrypted_inode(d->inode)) {
+ if (IS_ENCRYPTED(d->inode)) {
int save_len = fstr->len;
err = fscrypt_fname_disk_to_usr(d->inode,
@@ -862,7 +862,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
int err = 0;
- if (f2fs_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
goto out;
@@ -924,7 +924,7 @@ out:
static int f2fs_dir_open(struct inode *inode, struct file *filp)
{
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 12fabd6735dd..7ea5c9cede37 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,7 +24,6 @@
#include <linux/quotaops.h>
#include <crypto/hash.h>
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#ifdef CONFIG_F2FS_CHECK_FS
@@ -1137,7 +1136,7 @@ enum fsync_mode {
FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */
};
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) \
(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
#else
@@ -3328,7 +3327,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
int f2fs_build_stats(struct f2fs_sb_info *sbi);
void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
-int __init f2fs_create_root_stats(void);
+void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
#define stat_inc_cp_count(si) do { } while (0)
@@ -3366,7 +3365,7 @@ void f2fs_destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline int __init f2fs_create_root_stats(void) { return 0; }
+static inline void __init f2fs_create_root_stats(void) { }
static inline void f2fs_destroy_root_stats(void) { }
#endif
@@ -3463,19 +3462,14 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
/*
* crypto support
*/
-static inline bool f2fs_encrypted_inode(struct inode *inode)
-{
- return file_is_encrypt(inode);
-}
-
static inline bool f2fs_encrypted_file(struct inode *inode)
{
- return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode);
+ return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}
static inline void f2fs_set_encrypted_inode(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
file_set_encrypt(inode);
f2fs_set_inode_flags(inode);
#endif
@@ -3554,7 +3548,7 @@ static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
static inline bool f2fs_may_encrypt(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
umode_t mode = inode->i_mode;
return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index bba56b39dcc5..ba5954f41e14 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -582,7 +582,7 @@ truncate_out:
zero_user(page, offset, PAGE_SIZE - offset);
/* An encrypted inode should have a key and truncate the last page. */
- f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode));
+ f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode));
if (!cache_only)
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -711,7 +711,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
stat->attributes |= STATX_ATTR_APPEND;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
stat->attributes |= STATX_ATTR_ENCRYPTED;
if (flags & F2FS_IMMUTABLE_FL)
stat->attributes |= STATX_ATTR_IMMUTABLE;
@@ -1563,7 +1563,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (f2fs_encrypted_inode(inode) &&
+ if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
@@ -1647,7 +1647,7 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int flags = fi->i_flags;
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
flags |= F2FS_ENCRYPT_FL;
if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
flags |= F2FS_INLINE_DATA_FL;
@@ -2414,7 +2414,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
return -EINVAL;
- if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
+ if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst))
return -EOPNOTSUPP;
if (src == dst) {
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index bec52961630b..d910a820ae67 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -43,7 +43,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & F2FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (f2fs_encrypted_inode(inode))
+ if (file_is_encrypt(inode))
new_fl |= S_ENCRYPTED;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
@@ -453,7 +453,7 @@ make_now:
inode->i_mapping->a_ops = &f2fs_dblock_aops;
inode_nohighmem(inode);
} else if (S_ISLNK(inode->i_mode)) {
- if (f2fs_encrypted_inode(inode))
+ if (file_is_encrypt(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 62d9829f3a6a..e967d27c1a89 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -75,7 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
/* If the directory encrypted, then we should encrypt the inode. */
- if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
@@ -476,7 +476,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
if (err)
goto out_iput;
}
- if (f2fs_encrypted_inode(dir) &&
+ if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
f2fs_msg(inode->i_sb, KERN_WARNING,
@@ -803,7 +803,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
+ if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
int err = fscrypt_get_encryption_info(dir);
if (err)
return err;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c46a1d4318d4..d1ccc52afc93 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -757,7 +757,7 @@ static int parse_options(struct super_block *sb, char *options)
kvfree(name);
break;
case Opt_test_dummy_encryption:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (!f2fs_sb_has_encrypt(sbi)) {
f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
return -EINVAL;
@@ -1390,7 +1390,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",whint_mode=%s", "user-based");
else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
seq_printf(seq, ",whint_mode=%s", "fs-based");
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (F2FS_OPTION(sbi).test_dummy_encryption)
seq_puts(seq, ",test_dummy_encryption");
#endif
@@ -2154,7 +2154,7 @@ static const struct super_operations f2fs_sops = {
.remount_fs = f2fs_remount,
};
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
{
return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
@@ -3116,7 +3116,7 @@ try_onemore:
#endif
sb->s_op = &f2fs_sops;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &f2fs_cryptops;
#endif
sb->s_xattr = f2fs_xattr_handlers;
@@ -3545,9 +3545,7 @@ static int __init init_f2fs_fs(void)
err = register_filesystem(&f2fs_fs_type);
if (err)
goto free_shrinker;
- err = f2fs_create_root_stats();
- if (err)
- goto free_filesystem;
+ f2fs_create_root_stats();
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
@@ -3555,7 +3553,6 @@ static int __init init_f2fs_fs(void)
free_root_stats:
f2fs_destroy_root_stats();
-free_filesystem:
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 0575edbe3ed6..70da6801c86f 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -431,7 +431,7 @@ F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
#endif
#ifdef CONFIG_BLK_DEV_ZONED
@@ -492,7 +492,7 @@ static struct attribute *f2fs_attrs[] = {
};
static struct attribute *f2fs_feat_attrs[] = {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
#ifdef CONFIG_BLK_DEV_ZONED
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 13935ee99e1e..b3bed32946b1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -214,6 +214,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.fallocate = fat_fallocate,
};
diff --git a/fs/file.c b/fs/file.c
index 3209ee271c41..3da91a112bab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -457,6 +457,7 @@ struct files_struct init_files = {
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
@@ -705,7 +706,7 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
-static struct file *__fget(unsigned int fd, fmode_t mask)
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
{
struct files_struct *files = current->files;
struct file *file;
@@ -720,7 +721,7 @@ loop:
*/
if (file->f_mode & mask)
file = NULL;
- else if (!get_file_rcu(file))
+ else if (!get_file_rcu_many(file, refs))
goto loop;
}
rcu_read_unlock();
@@ -728,15 +729,20 @@ loop:
return file;
}
+struct file *fget_many(unsigned int fd, unsigned int refs)
+{
+ return __fget(fd, FMODE_PATH, refs);
+}
+
struct file *fget(unsigned int fd)
{
- return __fget(fd, FMODE_PATH);
+ return __fget(fd, FMODE_PATH, 1);
}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
- return __fget(fd, 0);
+ return __fget(fd, 0, 1);
}
EXPORT_SYMBOL(fget_raw);
@@ -767,7 +773,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask);
+ file = __fget(fd, mask, 1);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
diff --git a/fs/file_table.c b/fs/file_table.c
index 5679e7fcb6b0..155d7514a094 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -326,9 +326,9 @@ void flush_delayed_fput(void)
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
-void fput(struct file *file)
+void fput_many(struct file *file, unsigned int refs)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (atomic_long_sub_and_test(refs, &file->f_count)) {
struct task_struct *task = current;
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -347,6 +347,11 @@ void fput(struct file *file)
}
}
+void fput(struct file *file)
+{
+ fput_many(file, 1);
+}
+
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
diff --git a/fs/filesystems.c b/fs/filesystems.c
index b03f57b1105b..9135646e41ac 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/fs_parser.h>
/*
* Handling of filesystem drivers list.
@@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs)
int res = 0;
struct file_system_type ** p;
+ if (fs->parameters && !fs_validate_description(fs->parameters))
+ return -EINVAL;
+
BUG_ON(strchr(fs->name, '.'));
if (fs->next)
return -EBUSY;
diff --git a/fs/fs_context.c b/fs/fs_context.c
new file mode 100644
index 000000000000..87e3546b9a52
--- /dev/null
+++ b/fs/fs_context.c
@@ -0,0 +1,642 @@
+/* Provide a way to create a superblock configuration context within the kernel
+ * that allows a superblock to be set up prior to mounting.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/security.h>
+#include <linux/mnt_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <net/net_namespace.h>
+#include "mount.h"
+#include "internal.h"
+
+enum legacy_fs_param {
+ LEGACY_FS_UNSET_PARAMS,
+ LEGACY_FS_MONOLITHIC_PARAMS,
+ LEGACY_FS_INDIVIDUAL_PARAMS,
+};
+
+struct legacy_fs_context {
+ char *legacy_data; /* Data page for legacy filesystems */
+ size_t data_size;
+ enum legacy_fs_param param_type;
+};
+
+static int legacy_init_fs_context(struct fs_context *fc);
+
+static const struct constant_table common_set_sb_flag[] = {
+ { "dirsync", SB_DIRSYNC },
+ { "lazytime", SB_LAZYTIME },
+ { "mand", SB_MANDLOCK },
+ { "posixacl", SB_POSIXACL },
+ { "ro", SB_RDONLY },
+ { "sync", SB_SYNCHRONOUS },
+};
+
+static const struct constant_table common_clear_sb_flag[] = {
+ { "async", SB_SYNCHRONOUS },
+ { "nolazytime", SB_LAZYTIME },
+ { "nomand", SB_MANDLOCK },
+ { "rw", SB_RDONLY },
+ { "silent", SB_SILENT },
+};
+
+static const char *const forbidden_sb_flag[] = {
+ "bind",
+ "dev",
+ "exec",
+ "move",
+ "noatime",
+ "nodev",
+ "nodiratime",
+ "noexec",
+ "norelatime",
+ "nostrictatime",
+ "nosuid",
+ "private",
+ "rec",
+ "relatime",
+ "remount",
+ "shared",
+ "slave",
+ "strictatime",
+ "suid",
+ "unbindable",
+};
+
+/*
+ * Check for a common mount option that manipulates s_flags.
+ */
+static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
+{
+ unsigned int token;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++)
+ if (strcmp(key, forbidden_sb_flag[i]) == 0)
+ return -EINVAL;
+
+ token = lookup_constant(common_set_sb_flag, key, 0);
+ if (token) {
+ fc->sb_flags |= token;
+ fc->sb_flags_mask |= token;
+ return 0;
+ }
+
+ token = lookup_constant(common_clear_sb_flag, key, 0);
+ if (token) {
+ fc->sb_flags &= ~token;
+ fc->sb_flags_mask |= token;
+ return 0;
+ }
+
+ return -ENOPARAM;
+}
+
+/**
+ * vfs_parse_fs_param - Add a single parameter to a superblock config
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * A single mount option in string form is applied to the filesystem context
+ * being set up. Certain standard options (for example "ro") are translated
+ * into flag bits without going to the filesystem. The active security module
+ * is allowed to observe and poach options. Any other options are passed over
+ * to the filesystem to parse.
+ *
+ * This may be called multiple times for a context.
+ *
+ * Returns 0 on success and a negative error code on failure. In the event of
+ * failure, supplementary error information may have been set.
+ */
+int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ int ret;
+
+ if (!param->key)
+ return invalf(fc, "Unnamed parameter\n");
+
+ ret = vfs_parse_sb_flag(fc, param->key);
+ if (ret != -ENOPARAM)
+ return ret;
+
+ ret = security_fs_context_parse_param(fc, param);
+ if (ret != -ENOPARAM)
+ /* Param belongs to the LSM or is disallowed by the LSM; so
+ * don't pass to the FS.
+ */
+ return ret;
+
+ if (fc->ops->parse_param) {
+ ret = fc->ops->parse_param(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
+ }
+
+ /* If the filesystem doesn't take any arguments, give it the
+ * default handling of source.
+ */
+ if (strcmp(param->key, "source") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Non-string source");
+ if (fc->source)
+ return invalf(fc, "VFS: Multiple sources");
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ return invalf(fc, "%s: Unknown parameter '%s'",
+ fc->fs_type->name, param->key);
+}
+EXPORT_SYMBOL(vfs_parse_fs_param);
+
+/**
+ * vfs_parse_fs_string - Convenience function to just parse a string.
+ */
+int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+ const char *value, size_t v_size)
+{
+ int ret;
+
+ struct fs_parameter param = {
+ .key = key,
+ .type = fs_value_is_string,
+ .size = v_size,
+ };
+
+ if (v_size > 0) {
+ param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+ if (!param.string)
+ return -ENOMEM;
+ }
+
+ ret = vfs_parse_fs_param(fc, &param);
+ kfree(param.string);
+ return ret;
+}
+EXPORT_SYMBOL(vfs_parse_fs_string);
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @ctx: The superblock configuration to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
+ * called from the ->monolithic_mount_data() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_option() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+ char *options = data, *key;
+ int ret = 0;
+
+ if (!options)
+ return 0;
+
+ ret = security_sb_eat_lsm_opts(options, &fc->security);
+ if (ret)
+ return ret;
+
+ while ((key = strsep(&options, ",")) != NULL) {
+ if (*key) {
+ size_t v_len = 0;
+ char *value = strchr(key, '=');
+
+ if (value) {
+ if (value == key)
+ continue;
+ *value++ = 0;
+ v_len = strlen(value);
+ }
+ ret = vfs_parse_fs_string(fc, key, value, v_len);
+ if (ret < 0)
+ break;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(generic_parse_monolithic);
+
+/**
+ * alloc_fs_context - Create a filesystem context.
+ * @fs_type: The filesystem type.
+ * @reference: The dentry from which this one derives (or NULL)
+ * @sb_flags: Filesystem/superblock flags (SB_*)
+ * @sb_flags_mask: Applicable members of @sb_flags
+ * @purpose: The purpose that this configuration shall be used for.
+ *
+ * Open a filesystem and create a mount context. The mount context is
+ * initialised with the supplied flags and, if a submount/automount from
+ * another superblock (referred to by @reference) is supplied, may have
+ * parameters such as namespaces copied across from that superblock.
+ */
+static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
+ struct dentry *reference,
+ unsigned int sb_flags,
+ unsigned int sb_flags_mask,
+ enum fs_context_purpose purpose)
+{
+ int (*init_fs_context)(struct fs_context *);
+ struct fs_context *fc;
+ int ret = -ENOMEM;
+
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ return ERR_PTR(-ENOMEM);
+
+ fc->purpose = purpose;
+ fc->sb_flags = sb_flags;
+ fc->sb_flags_mask = sb_flags_mask;
+ fc->fs_type = get_filesystem(fs_type);
+ fc->cred = get_current_cred();
+ fc->net_ns = get_net(current->nsproxy->net_ns);
+
+ switch (purpose) {
+ case FS_CONTEXT_FOR_MOUNT:
+ fc->user_ns = get_user_ns(fc->cred->user_ns);
+ break;
+ case FS_CONTEXT_FOR_SUBMOUNT:
+ fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
+ break;
+ case FS_CONTEXT_FOR_RECONFIGURE:
+ /* We don't pin any namespaces as the superblock's
+ * subscriptions cannot be changed at this point.
+ */
+ atomic_inc(&reference->d_sb->s_active);
+ fc->root = dget(reference);
+ break;
+ }
+
+ /* TODO: Make all filesystems support this unconditionally */
+ init_fs_context = fc->fs_type->init_fs_context;
+ if (!init_fs_context)
+ init_fs_context = legacy_init_fs_context;
+
+ ret = init_fs_context(fc);
+ if (ret < 0)
+ goto err_fc;
+ fc->need_free = true;
+ return fc;
+
+err_fc:
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+}
+
+struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
+ unsigned int sb_flags)
+{
+ return alloc_fs_context(fs_type, NULL, sb_flags, 0,
+ FS_CONTEXT_FOR_MOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_mount);
+
+struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
+ unsigned int sb_flags,
+ unsigned int sb_flags_mask)
+{
+ return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags,
+ sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE);
+}
+EXPORT_SYMBOL(fs_context_for_reconfigure);
+
+struct fs_context *fs_context_for_submount(struct file_system_type *type,
+ struct dentry *reference)
+{
+ return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_submount);
+
+void fc_drop_locked(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
+ fc->root = NULL;
+ deactivate_locked_super(sb);
+}
+
+static void legacy_fs_context_free(struct fs_context *fc);
+
+/**
+ * vfs_dup_fc_config: Duplicate a filesystem context.
+ * @src_fc: The context to copy.
+ */
+struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
+{
+ struct fs_context *fc;
+ int ret;
+
+ if (!src_fc->ops->dup)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ return ERR_PTR(-ENOMEM);
+
+ fc->fs_private = NULL;
+ fc->s_fs_info = NULL;
+ fc->source = NULL;
+ fc->security = NULL;
+ get_filesystem(fc->fs_type);
+ get_net(fc->net_ns);
+ get_user_ns(fc->user_ns);
+ get_cred(fc->cred);
+
+ /* Can't call put until we've called ->dup */
+ ret = fc->ops->dup(fc, src_fc);
+ if (ret < 0)
+ goto err_fc;
+
+ ret = security_fs_context_dup(fc, src_fc);
+ if (ret < 0)
+ goto err_fc;
+ return fc;
+
+err_fc:
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(vfs_dup_fs_context);
+
+#ifdef CONFIG_PRINTK
+/**
+ * logfc - Log a message to a filesystem context
+ * @fc: The filesystem context to log to.
+ * @fmt: The format of the buffer.
+ */
+void logfc(struct fs_context *fc, const char *fmt, ...)
+{
+ va_list va;
+
+ va_start(va, fmt);
+
+ switch (fmt[0]) {
+ case 'w':
+ vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va);
+ break;
+ case 'e':
+ vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va);
+ break;
+ default:
+ vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va);
+ break;
+ }
+
+ pr_cont("\n");
+ va_end(va);
+}
+EXPORT_SYMBOL(logfc);
+#endif
+
+/**
+ * put_fs_context - Dispose of a superblock configuration context.
+ * @fc: The context to dispose of.
+ */
+void put_fs_context(struct fs_context *fc)
+{
+ struct super_block *sb;
+
+ if (fc->root) {
+ sb = fc->root->d_sb;
+ dput(fc->root);
+ fc->root = NULL;
+ deactivate_super(sb);
+ }
+
+ if (fc->need_free && fc->ops && fc->ops->free)
+ fc->ops->free(fc);
+
+ security_free_mnt_opts(&fc->security);
+ put_net(fc->net_ns);
+ put_user_ns(fc->user_ns);
+ put_cred(fc->cred);
+ kfree(fc->subtype);
+ put_filesystem(fc->fs_type);
+ kfree(fc->source);
+ kfree(fc);
+}
+EXPORT_SYMBOL(put_fs_context);
+
+/*
+ * Free the config for a filesystem that doesn't support fs_context.
+ */
+static void legacy_fs_context_free(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+
+ if (ctx) {
+ if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
+ kfree(ctx->legacy_data);
+ kfree(ctx);
+ }
+}
+
+/*
+ * Duplicate a legacy config.
+ */
+static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct legacy_fs_context *ctx;
+ struct legacy_fs_context *src_ctx = src_fc->fs_private;
+
+ ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
+ ctx->legacy_data = kmemdup(src_ctx->legacy_data,
+ src_ctx->data_size, GFP_KERNEL);
+ if (!ctx->legacy_data) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ }
+
+ fc->fs_private = ctx;
+ return 0;
+}
+
+/*
+ * Add a parameter to a legacy config. We build up a comma-separated list of
+ * options.
+ */
+static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ unsigned int size = ctx->data_size;
+ size_t len = 0;
+
+ if (strcmp(param->key, "source") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Legacy: Non-string source");
+ if (fc->source)
+ return invalf(fc, "VFS: Legacy: Multiple sources");
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) &&
+ strcmp(param->key, "subtype") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Legacy: Non-string subtype");
+ if (fc->subtype)
+ return invalf(fc, "VFS: Legacy: Multiple subtype");
+ fc->subtype = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
+ return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
+
+ switch (param->type) {
+ case fs_value_is_string:
+ len = 1 + param->size;
+ /* Fall through */
+ case fs_value_is_flag:
+ len += strlen(param->key);
+ break;
+ default:
+ return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
+ param->key);
+ }
+
+ if (len > PAGE_SIZE - 2 - size)
+ return invalf(fc, "VFS: Legacy: Cumulative options too large");
+ if (strchr(param->key, ',') ||
+ (param->type == fs_value_is_string &&
+ memchr(param->string, ',', param->size)))
+ return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
+ param->key);
+ if (!ctx->legacy_data) {
+ ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ctx->legacy_data)
+ return -ENOMEM;
+ }
+
+ ctx->legacy_data[size++] = ',';
+ len = strlen(param->key);
+ memcpy(ctx->legacy_data + size, param->key, len);
+ size += len;
+ if (param->type == fs_value_is_string) {
+ ctx->legacy_data[size++] = '=';
+ memcpy(ctx->legacy_data + size, param->string, param->size);
+ size += param->size;
+ }
+ ctx->legacy_data[size] = '\0';
+ ctx->data_size = size;
+ ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
+ return 0;
+}
+
+/*
+ * Add monolithic mount data.
+ */
+static int legacy_parse_monolithic(struct fs_context *fc, void *data)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+
+ if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
+ pr_warn("VFS: Can't mix monolithic and individual options\n");
+ return -EINVAL;
+ }
+
+ ctx->legacy_data = data;
+ ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
+ if (!ctx->legacy_data)
+ return 0;
+
+ if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
+ return 0;
+ return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
+}
+
+/*
+ * Get a mountable root with the legacy mount command.
+ */
+static int legacy_get_tree(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ struct super_block *sb;
+ struct dentry *root;
+
+ root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
+ fc->source, ctx->legacy_data);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+
+ sb = root->d_sb;
+ BUG_ON(!sb);
+
+ fc->root = root;
+ return 0;
+}
+
+/*
+ * Handle remount.
+ */
+static int legacy_reconfigure(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+
+ if (!sb->s_op->remount_fs)
+ return 0;
+
+ return sb->s_op->remount_fs(sb, &fc->sb_flags,
+ ctx ? ctx->legacy_data : NULL);
+}
+
+const struct fs_context_operations legacy_fs_context_ops = {
+ .free = legacy_fs_context_free,
+ .dup = legacy_fs_context_dup,
+ .parse_param = legacy_parse_param,
+ .parse_monolithic = legacy_parse_monolithic,
+ .get_tree = legacy_get_tree,
+ .reconfigure = legacy_reconfigure,
+};
+
+/*
+ * Initialise a legacy context for a filesystem that doesn't support
+ * fs_context.
+ */
+static int legacy_init_fs_context(struct fs_context *fc)
+{
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ if (!fc->fs_private)
+ return -ENOMEM;
+ fc->ops = &legacy_fs_context_ops;
+ return 0;
+}
+
+int parse_monolithic_mount_data(struct fs_context *fc, void *data)
+{
+ int (*monolithic_mount_data)(struct fs_context *, void *);
+
+ monolithic_mount_data = fc->ops->parse_monolithic;
+ if (!monolithic_mount_data)
+ monolithic_mount_data = generic_parse_monolithic;
+
+ return monolithic_mount_data(fc, data);
+}
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
new file mode 100644
index 000000000000..842e8f749db6
--- /dev/null
+++ b/fs/fs_parser.c
@@ -0,0 +1,447 @@
+/* Filesystem parameter parser.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+static const struct constant_table bool_names[] = {
+ { "0", false },
+ { "1", true },
+ { "false", false },
+ { "no", false },
+ { "true", true },
+ { "yes", true },
+};
+
+/**
+ * lookup_constant - Look up a constant by name in an ordered table
+ * @tbl: The table of constants to search.
+ * @tbl_size: The size of the table.
+ * @name: The name to look up.
+ * @not_found: The value to return if the name is not found.
+ */
+int __lookup_constant(const struct constant_table *tbl, size_t tbl_size,
+ const char *name, int not_found)
+{
+ unsigned int i;
+
+ for (i = 0; i < tbl_size; i++)
+ if (strcmp(name, tbl[i].name) == 0)
+ return tbl[i].value;
+
+ return not_found;
+}
+EXPORT_SYMBOL(__lookup_constant);
+
+static const struct fs_parameter_spec *fs_lookup_key(
+ const struct fs_parameter_description *desc,
+ const char *name)
+{
+ const struct fs_parameter_spec *p;
+
+ if (!desc->specs)
+ return NULL;
+
+ for (p = desc->specs; p->name; p++)
+ if (strcmp(p->name, name) == 0)
+ return p;
+
+ return NULL;
+}
+
+/*
+ * fs_parse - Parse a filesystem configuration parameter
+ * @fc: The filesystem context to log errors through.
+ * @desc: The parameter description to use.
+ * @param: The parameter.
+ * @result: Where to place the result of the parse
+ *
+ * Parse a filesystem configuration parameter and attempt a conversion for a
+ * simple parameter for which this is requested. If successful, the determined
+ * parameter ID is placed into @result->key, the desired type is indicated in
+ * @result->t and any converted value is placed into an appropriate member of
+ * the union in @result.
+ *
+ * The function returns the parameter number if the parameter was matched,
+ * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that
+ * unknown parameters are okay and -EINVAL if there was a conversion issue or
+ * the parameter wasn't recognised and unknowns aren't okay.
+ */
+int fs_parse(struct fs_context *fc,
+ const struct fs_parameter_description *desc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ const struct fs_parameter_spec *p;
+ const struct fs_parameter_enum *e;
+ int ret = -ENOPARAM, b;
+
+ result->has_value = !!param->string;
+ result->negated = false;
+ result->uint_64 = 0;
+
+ p = fs_lookup_key(desc, param->key);
+ if (!p) {
+ /* If we didn't find something that looks like "noxxx", see if
+ * "xxx" takes the "no"-form negative - but only if there
+ * wasn't an value.
+ */
+ if (result->has_value)
+ goto unknown_parameter;
+ if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2])
+ goto unknown_parameter;
+
+ p = fs_lookup_key(desc, param->key + 2);
+ if (!p)
+ goto unknown_parameter;
+ if (!(p->flags & fs_param_neg_with_no))
+ goto unknown_parameter;
+ result->boolean = false;
+ result->negated = true;
+ }
+
+ if (p->flags & fs_param_deprecated)
+ warnf(fc, "%s: Deprecated parameter '%s'",
+ desc->name, param->key);
+
+ if (result->negated)
+ goto okay;
+
+ /* Certain parameter types only take a string and convert it. */
+ switch (p->type) {
+ case __fs_param_wasnt_defined:
+ return -EINVAL;
+ case fs_param_is_u32:
+ case fs_param_is_u32_octal:
+ case fs_param_is_u32_hex:
+ case fs_param_is_s32:
+ case fs_param_is_u64:
+ case fs_param_is_enum:
+ case fs_param_is_string:
+ if (param->type != fs_value_is_string)
+ goto bad_value;
+ if (!result->has_value) {
+ if (p->flags & fs_param_v_optional)
+ goto okay;
+ goto bad_value;
+ }
+ /* Fall through */
+ default:
+ break;
+ }
+
+ /* Try to turn the type we were given into the type desired by the
+ * parameter and give an error if we can't.
+ */
+ switch (p->type) {
+ case fs_param_is_flag:
+ if (param->type != fs_value_is_flag &&
+ (param->type != fs_value_is_string || result->has_value))
+ return invalf(fc, "%s: Unexpected value for '%s'",
+ desc->name, param->key);
+ result->boolean = true;
+ goto okay;
+
+ case fs_param_is_bool:
+ switch (param->type) {
+ case fs_value_is_flag:
+ result->boolean = true;
+ goto okay;
+ case fs_value_is_string:
+ if (param->size == 0) {
+ result->boolean = true;
+ goto okay;
+ }
+ b = lookup_constant(bool_names, param->string, -1);
+ if (b == -1)
+ goto bad_value;
+ result->boolean = b;
+ goto okay;
+ default:
+ goto bad_value;
+ }
+
+ case fs_param_is_u32:
+ ret = kstrtouint(param->string, 0, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_u32_octal:
+ ret = kstrtouint(param->string, 8, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_u32_hex:
+ ret = kstrtouint(param->string, 16, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_s32:
+ ret = kstrtoint(param->string, 0, &result->int_32);
+ goto maybe_okay;
+ case fs_param_is_u64:
+ ret = kstrtoull(param->string, 0, &result->uint_64);
+ goto maybe_okay;
+
+ case fs_param_is_enum:
+ for (e = desc->enums; e->name[0]; e++) {
+ if (e->opt == p->opt &&
+ strcmp(e->name, param->string) == 0) {
+ result->uint_32 = e->value;
+ goto okay;
+ }
+ }
+ goto bad_value;
+
+ case fs_param_is_string:
+ goto okay;
+ case fs_param_is_blob:
+ if (param->type != fs_value_is_blob)
+ goto bad_value;
+ goto okay;
+
+ case fs_param_is_fd: {
+ if (param->type != fs_value_is_file)
+ goto bad_value;
+ goto okay;
+ }
+
+ case fs_param_is_blockdev:
+ case fs_param_is_path:
+ goto okay;
+ default:
+ BUG();
+ }
+
+maybe_okay:
+ if (ret < 0)
+ goto bad_value;
+okay:
+ return p->opt;
+
+bad_value:
+ return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key);
+unknown_parameter:
+ return -ENOPARAM;
+}
+EXPORT_SYMBOL(fs_parse);
+
+/**
+ * fs_lookup_param - Look up a path referred to by a parameter
+ * @fc: The filesystem context to log errors through.
+ * @param: The parameter.
+ * @want_bdev: T if want a blockdev
+ * @_path: The result of the lookup
+ */
+int fs_lookup_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ bool want_bdev,
+ struct path *_path)
+{
+ struct filename *f;
+ unsigned int flags = 0;
+ bool put_f;
+ int ret;
+
+ switch (param->type) {
+ case fs_value_is_string:
+ f = getname_kernel(param->string);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ put_f = true;
+ break;
+ case fs_value_is_filename_empty:
+ flags = LOOKUP_EMPTY;
+ /* Fall through */
+ case fs_value_is_filename:
+ f = param->name;
+ put_f = false;
+ break;
+ default:
+ return invalf(fc, "%s: not usable as path", param->key);
+ }
+
+ ret = filename_lookup(param->dirfd, f, flags, _path, NULL);
+ if (ret < 0) {
+ errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name);
+ goto out;
+ }
+
+ if (want_bdev &&
+ !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) {
+ path_put(_path);
+ _path->dentry = NULL;
+ _path->mnt = NULL;
+ errorf(fc, "%s: Non-blockdev passed as '%s'",
+ param->key, f->name);
+ ret = -ENOTBLK;
+ }
+
+out:
+ if (put_f)
+ putname(f);
+ return ret;
+}
+EXPORT_SYMBOL(fs_lookup_param);
+
+#ifdef CONFIG_VALIDATE_FS_PARSER
+/**
+ * validate_constant_table - Validate a constant table
+ * @name: Name to use in reporting
+ * @tbl: The constant table to validate.
+ * @tbl_size: The size of the table.
+ * @low: The lowest permissible value.
+ * @high: The highest permissible value.
+ * @special: One special permissible value outside of the range.
+ */
+bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
+ int low, int high, int special)
+{
+ size_t i;
+ bool good = true;
+
+ if (tbl_size == 0) {
+ pr_warn("VALIDATE C-TBL: Empty\n");
+ return true;
+ }
+
+ for (i = 0; i < tbl_size; i++) {
+ if (!tbl[i].name) {
+ pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
+ good = false;
+ } else if (i > 0 && tbl[i - 1].name) {
+ int c = strcmp(tbl[i-1].name, tbl[i].name);
+
+ if (c == 0) {
+ pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
+ i, tbl[i].name);
+ good = false;
+ }
+ if (c > 0) {
+ pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
+ i, tbl[i-1].name, tbl[i].name);
+ good = false;
+ }
+ }
+
+ if (tbl[i].value != special &&
+ (tbl[i].value < low || tbl[i].value > high)) {
+ pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
+ i, tbl[i].name, tbl[i].value, low, high);
+ good = false;
+ }
+ }
+
+ return good;
+}
+
+/**
+ * fs_validate_description - Validate a parameter description
+ * @desc: The parameter description to validate.
+ */
+bool fs_validate_description(const struct fs_parameter_description *desc)
+{
+ const struct fs_parameter_spec *param, *p2;
+ const struct fs_parameter_enum *e;
+ const char *name = desc->name;
+ unsigned int nr_params = 0;
+ bool good = true, enums = false;
+
+ pr_notice("*** VALIDATE %s ***\n", name);
+
+ if (!name[0]) {
+ pr_err("VALIDATE Parser: No name\n");
+ name = "Unknown";
+ good = false;
+ }
+
+ if (desc->specs) {
+ for (param = desc->specs; param->name; param++) {
+ enum fs_parameter_type t = param->type;
+
+ /* Check that the type is in range */
+ if (t == __fs_param_wasnt_defined ||
+ t >= nr__fs_parameter_type) {
+ pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n",
+ name, param->name, t);
+ good = false;
+ } else if (t == fs_param_is_enum) {
+ enums = true;
+ }
+
+ /* Check for duplicate parameter names */
+ for (p2 = desc->specs; p2 < param; p2++) {
+ if (strcmp(param->name, p2->name) == 0) {
+ pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n",
+ name, param->name);
+ good = false;
+ }
+ }
+ }
+
+ nr_params = param - desc->specs;
+ }
+
+ if (desc->enums) {
+ if (!nr_params) {
+ pr_err("VALIDATE %s: Enum table but no parameters\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+ if (!enums) {
+ pr_err("VALIDATE %s: Enum table but no enum-type values\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+
+ for (e = desc->enums; e->name[0]; e++) {
+ /* Check that all entries in the enum table have at
+ * least one parameter that uses them.
+ */
+ for (param = desc->specs; param->name; param++) {
+ if (param->opt == e->opt &&
+ param->type != fs_param_is_enum) {
+ pr_err("VALIDATE %s: e[%lu] enum val for %s\n",
+ name, e - desc->enums, param->name);
+ good = false;
+ }
+ }
+ }
+
+ /* Check that all enum-type parameters have at least one enum
+ * value in the enum table.
+ */
+ for (param = desc->specs; param->name; param++) {
+ if (param->type != fs_param_is_enum)
+ continue;
+ for (e = desc->enums; e->name[0]; e++)
+ if (e->opt == param->opt)
+ break;
+ if (!e->name[0]) {
+ pr_err("VALIDATE %s: PARAM[%s] enum with no values\n",
+ name, param->name);
+ good = false;
+ }
+ }
+ } else {
+ if (enums) {
+ pr_err("VALIDATE %s: enum-type values, but no enum table\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+ }
+
+no_enums:
+ return good;
+}
+#endif /* CONFIG_VALIDATE_FS_PARSER */
diff --git a/fs/fs_types.c b/fs/fs_types.c
new file mode 100644
index 000000000000..78365e5dc08c
--- /dev/null
+++ b/fs/fs_types.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/export.h>
+
+/*
+ * fs on-disk file type to dirent file type conversion
+ */
+static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
+ [FT_UNKNOWN] = DT_UNKNOWN,
+ [FT_REG_FILE] = DT_REG,
+ [FT_DIR] = DT_DIR,
+ [FT_CHRDEV] = DT_CHR,
+ [FT_BLKDEV] = DT_BLK,
+ [FT_FIFO] = DT_FIFO,
+ [FT_SOCK] = DT_SOCK,
+ [FT_SYMLINK] = DT_LNK
+};
+
+/**
+ * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
+ * @filetype: The on-disk file type to convert.
+ *
+ * This function converts the on-disk file type value (FT_*) to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_ftype_to_dtype(unsigned int filetype)
+{
+ if (filetype >= FT_MAX)
+ return DT_UNKNOWN;
+
+ return fs_dtype_by_ftype[filetype];
+}
+EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
+
+/*
+ * dirent file type to fs on-disk file type conversion
+ * Values not initialized explicitly are FT_UNKNOWN (0).
+ */
+static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
+ [DT_REG] = FT_REG_FILE,
+ [DT_DIR] = FT_DIR,
+ [DT_LNK] = FT_SYMLINK,
+ [DT_CHR] = FT_CHRDEV,
+ [DT_BLK] = FT_BLKDEV,
+ [DT_FIFO] = FT_FIFO,
+ [DT_SOCK] = FT_SOCK,
+};
+
+/**
+ * fs_umode_to_ftype() - file mode to on-disk file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the on-disk file type (FT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * FT_UNKNOWN - Unknown type
+ * * FT_REG_FILE - Regular file
+ * * FT_DIR - Directory
+ * * FT_CHRDEV - Character device
+ * * FT_BLKDEV - Block device
+ * * FT_FIFO - FIFO
+ * * FT_SOCK - Local-domain socket
+ * * FT_SYMLINK - Symbolic link
+ */
+unsigned char fs_umode_to_ftype(umode_t mode)
+{
+ return fs_ftype_by_dtype[S_DT(mode)];
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
+
+/**
+ * fs_umode_to_dtype() - file mode to dirent file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_umode_to_dtype(umode_t mode)
+{
+ return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 989df5accaee..fe80bea4ad89 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -35,7 +35,9 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
{
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fuse_abort_conn(fc, true);
+ if (fc->abort_err)
+ fc->aborted = true;
+ fuse_abort_conn(fc);
fuse_conn_put(fc);
}
return count;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8f68181256c0..55a26f351467 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -141,10 +141,11 @@ static int cuse_open(struct inode *inode, struct file *file)
static int cuse_release(struct inode *inode, struct file *file)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
- fuse_sync_release(ff, file->f_flags);
+ fuse_sync_release(fi, ff, file->f_flags);
fuse_conn_put(fc);
return 0;
@@ -407,7 +408,7 @@ err_unlock:
err_region:
unregister_chrdev_region(devt, 1);
err:
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
goto out;
}
@@ -586,7 +587,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
{
struct cuse_conn *cc = dev_get_drvdata(dev);
- fuse_abort_conn(&cc->fc, false);
+ fuse_abort_conn(&cc->fc);
return count;
}
static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 809c0f2f9942..8a63e52785e9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,17 +251,18 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
struct file *file)
{
struct fuse_req *req = NULL;
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
do {
wait_event(fc->reserved_req_waitq, ff->reserved_req);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (ff->reserved_req) {
req = ff->reserved_req;
ff->reserved_req = NULL;
req->stolen_file = get_file(file);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
} while (!req);
return req;
@@ -273,16 +274,17 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
{
struct file *file = req->stolen_file;
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
WARN_ON(req->max_pages);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
memset(req, 0, sizeof(*req));
fuse_request_init(req, NULL, NULL, 0);
BUG_ON(ff->reserved_req);
ff->reserved_req = req;
wake_up_all(&fc->reserved_req_waitq);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fput(file);
}
@@ -431,10 +433,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
-
- spin_lock(&fiq->waitq.lock);
- list_del_init(&req->intr_entry);
- spin_unlock(&fiq->waitq.lock);
+ /*
+ * test_and_set_bit() implies smp_mb() between bit
+ * changing and below intr_entry check. Pairs with
+ * smp_mb() from queue_interrupt().
+ */
+ if (!list_empty(&req->intr_entry)) {
+ spin_lock(&fiq->waitq.lock);
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->waitq.lock);
+ }
WARN_ON(test_bit(FR_PENDING, &req->flags));
WARN_ON(test_bit(FR_SENT, &req->flags));
if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -462,27 +470,43 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
fc->active_background--;
flush_bg_queue(fc);
spin_unlock(&fc->bg_lock);
+ } else {
+ /* Wake up waiter sleeping in request_wait_answer() */
+ wake_up(&req->waitq);
}
- wake_up(&req->waitq);
+
if (req->end)
req->end(fc, req);
put_request:
fuse_put_request(fc, req);
}
-static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->waitq.lock);
- if (test_bit(FR_FINISHED, &req->flags)) {
+ /* Check for we've sent request to interrupt this req */
+ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
spin_unlock(&fiq->waitq.lock);
- return;
+ return -EINVAL;
}
+
if (list_empty(&req->intr_entry)) {
list_add_tail(&req->intr_entry, &fiq->interrupts);
+ /*
+ * Pairs with smp_mb() implied by test_and_set_bit()
+ * from request_end().
+ */
+ smp_mb();
+ if (test_bit(FR_FINISHED, &req->flags)) {
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->waitq.lock);
+ return 0;
+ }
wake_up_locked(&fiq->waitq);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
}
spin_unlock(&fiq->waitq.lock);
- kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ return 0;
}
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
@@ -1306,7 +1330,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
goto err_unlock;
if (!fiq->connected) {
- err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
goto err_unlock;
}
@@ -1353,7 +1377,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
spin_lock(&fpq->lock);
clear_bit(FR_LOCKED, &req->flags);
if (!fpq->connected) {
- err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
goto out_end;
}
if (err) {
@@ -1900,16 +1924,17 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
+ err = -EINVAL;
if (nbytes < sizeof(struct fuse_out_header))
- return -EINVAL;
+ goto out;
err = fuse_copy_one(cs, &oh, sizeof(oh));
if (err)
- goto err_finish;
+ goto copy_finish;
err = -EINVAL;
if (oh.len != nbytes)
- goto err_finish;
+ goto copy_finish;
/*
* Zero oh.unique indicates unsolicited notification message
@@ -1917,41 +1942,40 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
*/
if (!oh.unique) {
err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
- return err ? err : nbytes;
+ goto out;
}
err = -EINVAL;
if (oh.error <= -1000 || oh.error > 0)
- goto err_finish;
+ goto copy_finish;
spin_lock(&fpq->lock);
- err = -ENOENT;
- if (!fpq->connected)
- goto err_unlock_pq;
+ req = NULL;
+ if (fpq->connected)
+ req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
- req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
- if (!req)
- goto err_unlock_pq;
+ err = -ENOENT;
+ if (!req) {
+ spin_unlock(&fpq->lock);
+ goto copy_finish;
+ }
/* Is it an interrupt reply ID? */
if (oh.unique & FUSE_INT_REQ_BIT) {
__fuse_get_request(req);
spin_unlock(&fpq->lock);
- err = -EINVAL;
- if (nbytes != sizeof(struct fuse_out_header)) {
- fuse_put_request(fc, req);
- goto err_finish;
- }
-
- if (oh.error == -ENOSYS)
+ err = 0;
+ if (nbytes != sizeof(struct fuse_out_header))
+ err = -EINVAL;
+ else if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- queue_interrupt(&fc->iq, req);
+ err = queue_interrupt(&fc->iq, req);
+
fuse_put_request(fc, req);
- fuse_copy_finish(cs);
- return nbytes;
+ goto copy_finish;
}
clear_bit(FR_SENT, &req->flags);
@@ -1977,14 +2001,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_unlock(&fpq->lock);
request_end(fc, req);
-
+out:
return err ? err : nbytes;
- err_unlock_pq:
- spin_unlock(&fpq->lock);
- err_finish:
+copy_finish:
fuse_copy_finish(cs);
- return err;
+ goto out;
}
static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
@@ -2109,11 +2131,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
return mask;
}
-/*
- * Abort all requests on the given list (pending or processing)
- *
- * This function releases and reacquires fc->lock
- */
+/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
while (!list_empty(head)) {
@@ -2159,7 +2177,7 @@ static void end_polls(struct fuse_conn *fc)
* is OK, the request will in that case be removed from the list before we touch
* it.
*/
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
+void fuse_abort_conn(struct fuse_conn *fc)
{
struct fuse_iqueue *fiq = &fc->iq;
@@ -2175,7 +2193,6 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
fc->connected = 0;
spin_unlock(&fc->bg_lock);
- fc->aborted = is_abort;
fuse_set_initialized(fc);
list_for_each_entry(fud, &fc->devices, entry) {
struct fuse_pqueue *fpq = &fud->pq;
@@ -2253,7 +2270,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
WARN_ON(fc->iq.fasync != NULL);
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
}
fuse_dev_free(fud);
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e909678afa2d..dd0f64f7bc06 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -149,21 +149,6 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
args->out.args[0].value = outarg;
}
-u64 fuse_get_attr_version(struct fuse_conn *fc)
-{
- u64 curr_version;
-
- /*
- * The spin lock isn't actually needed on 64bit archs, but we
- * don't yet care too much about such optimizations.
- */
- spin_lock(&fc->lock);
- curr_version = fc->attr_version;
- spin_unlock(&fc->lock);
-
- return curr_version;
-}
-
/*
* Check whether the dentry is still valid
*
@@ -222,9 +207,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
goto invalid;
}
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
kfree(forget);
if (ret == -ENOMEM)
@@ -400,6 +385,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_create_in inarg;
struct fuse_open_out outopen;
struct fuse_entry_out outentry;
+ struct fuse_inode *fi;
struct fuse_file *ff;
/* Userspace expects S_IFREG in create mode */
@@ -451,7 +437,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
&outentry.attr, entry_attr_timeout(&outentry), 0);
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
- fuse_sync_release(ff, flags);
+ fuse_sync_release(NULL, ff, flags);
fuse_queue_forget(fc, forget, outentry.nodeid, 1);
err = -ENOMEM;
goto out_err;
@@ -462,7 +448,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
fuse_dir_changed(dir);
err = finish_open(file, entry, generic_file_open);
if (err) {
- fuse_sync_release(ff, flags);
+ fi = get_fuse_inode(inode);
+ fuse_sync_release(fi, ff, flags);
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
@@ -671,8 +658,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
struct inode *inode = d_inode(entry);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
/*
* If i_nlink == 0 then unlink doesn't make sense, yet this can
* happen if userspace filesystem is careless. It would be
@@ -681,7 +668,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
*/
if (inode->i_nlink > 0)
drop_nlink(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
fuse_dir_changed(dir);
fuse_invalidate_entry_cache(entry);
@@ -825,10 +812,10 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
if (!err) {
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
inc_nlink(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
fuse_update_ctime(inode);
} else if (err == -EINTR) {
@@ -1356,15 +1343,14 @@ static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
*/
void fuse_set_nowrite(struct inode *inode)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
BUG_ON(!inode_is_locked(inode));
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
BUG_ON(fi->writectr < 0);
fi->writectr += FUSE_NOWRITE;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
}
@@ -1385,11 +1371,11 @@ static void __fuse_release_nowrite(struct inode *inode)
void fuse_release_nowrite(struct inode *inode)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
__fuse_release_nowrite(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
@@ -1524,7 +1510,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
goto error;
}
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
/* the kernel maintains i_mtime locally */
if (trust_local_cmtime) {
if (attr->ia_valid & ATTR_MTIME)
@@ -1542,10 +1528,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
i_size_write(inode, outarg.attr.size);
if (is_truncate) {
- /* NOTE: this may release/reacquire fc->lock */
+ /* NOTE: this may release/reacquire fi->lock */
__fuse_release_nowrite(inode);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
/*
* Only call invalidate_inode_pages2() after removing
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a59c16bd90ac..06096b60f1df 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,8 +19,6 @@
#include <linux/falloc.h>
#include <linux/uio.h>
-static const struct file_operations fuse_direct_io_file_operations;
-
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
@@ -64,9 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
- spin_lock(&fc->lock);
- ff->kh = ++fc->khctr;
- spin_unlock(&fc->lock);
+ ff->kh = atomic64_inc_return(&fc->khctr);
return ff;
}
@@ -94,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
if (refcount_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
- if (ff->fc->no_open && !isdir) {
+ if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
/*
* Drop the release request when client does not
* implement 'open'
@@ -128,8 +124,9 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
return -ENOMEM;
ff->fh = 0;
- ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
- if (!fc->no_open || isdir) {
+ /* Default for no-open */
+ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
+ if (isdir ? !fc->no_opendir : !fc->no_open) {
struct fuse_open_out outarg;
int err;
@@ -138,11 +135,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
- } else if (err != -ENOSYS || isdir) {
+ } else if (err != -ENOSYS) {
fuse_file_free(ff);
return err;
} else {
- fc->no_open = 1;
+ if (isdir)
+ fc->no_opendir = 1;
+ else
+ fc->no_open = 1;
}
}
@@ -159,17 +159,16 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
static void fuse_link_write_file(struct file *file)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
/*
* file may be written through mmap, so chain it onto the
* inodes's write_file list
*/
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (list_empty(&ff->write_entry))
list_add(&ff->write_entry, &fi->write_files);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
void fuse_finish_open(struct inode *inode, struct file *file)
@@ -177,8 +176,6 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (ff->open_flags & FOPEN_DIRECT_IO)
- file->f_op = &fuse_direct_io_file_operations;
if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_NONSEEKABLE)
@@ -186,10 +183,10 @@ void fuse_finish_open(struct inode *inode, struct file *file)
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
@@ -224,14 +221,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
return err;
}
-static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
+static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
+ int flags, int opcode)
{
struct fuse_conn *fc = ff->fc;
struct fuse_req *req = ff->reserved_req;
struct fuse_release_in *inarg = &req->misc.release.in;
+ /* Inode is NULL on error path of fuse_create_open() */
+ if (likely(fi)) {
+ spin_lock(&fi->lock);
+ list_del(&ff->write_entry);
+ spin_unlock(&fi->lock);
+ }
spin_lock(&fc->lock);
- list_del(&ff->write_entry);
if (!RB_EMPTY_NODE(&ff->polled_node))
rb_erase(&ff->polled_node, &fc->polled_files);
spin_unlock(&fc->lock);
@@ -249,11 +252,12 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
void fuse_release_common(struct file *file, bool isdir)
{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
struct fuse_req *req = ff->reserved_req;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
- fuse_prepare_release(ff, file->f_flags, opcode);
+ fuse_prepare_release(fi, ff, file->f_flags, opcode);
if (ff->flock) {
struct fuse_release_in *inarg = &req->misc.release.in;
@@ -295,10 +299,10 @@ static int fuse_release(struct inode *inode, struct file *file)
return 0;
}
-void fuse_sync_release(struct fuse_file *ff, int flags)
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
- fuse_prepare_release(ff, flags, FUSE_RELEASE);
+ fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
/*
* iput(NULL) is a no-op and since the refcount is 1 and everything's
* synchronous, we are fine with not doing igrab() here"
@@ -329,33 +333,39 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
return (u64) v0 + ((u64) v1 << 32);
}
-/*
- * Check if any page in a range is under writeback
- *
- * This is currently done by walking the list of writepage requests
- * for the inode, which can be pretty inefficient.
- */
-static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
- pgoff_t idx_to)
+static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi,
+ pgoff_t idx_from, pgoff_t idx_to)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_req *req;
- bool found = false;
- spin_lock(&fc->lock);
list_for_each_entry(req, &fi->writepages, writepages_entry) {
pgoff_t curr_index;
- BUG_ON(req->inode != inode);
+ WARN_ON(get_fuse_inode(req->inode) != fi);
curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
if (idx_from < curr_index + req->num_pages &&
curr_index <= idx_to) {
- found = true;
- break;
+ return req;
}
}
- spin_unlock(&fc->lock);
+ return NULL;
+}
+
+/*
+ * Check if any page in a range is under writeback
+ *
+ * This is currently done by walking the list of writepage requests
+ * for the inode, which can be pretty inefficient.
+ */
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ bool found;
+
+ spin_lock(&fi->lock);
+ found = fuse_find_writeback(fi, idx_from, idx_to);
+ spin_unlock(&fi->lock);
return found;
}
@@ -598,9 +608,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
- spin_unlock(&fc->lock);
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ spin_unlock(&fi->lock);
}
io->iocb->ki_complete(io->iocb, res, 0);
@@ -675,13 +685,13 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (attr_ver == fi->attr_version && size < inode->i_size &&
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
- fi->attr_version = ++fc->attr_version;
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
static void fuse_short_read(struct fuse_req *req, struct inode *inode,
@@ -919,7 +929,7 @@ out:
return err;
}
-static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -996,13 +1006,13 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
struct fuse_inode *fi = get_fuse_inode(inode);
bool ret = false;
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
if (pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return ret;
}
@@ -1125,9 +1135,6 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
int err = 0;
ssize_t res = 0;
- if (is_bad_inode(inode))
- return -EIO;
-
if (inode->i_size < pos + iov_iter_count(ii))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -1173,7 +1180,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
return res > 0 ? res : err;
}
-static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -1416,9 +1423,6 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
ssize_t res;
struct inode *inode = file_inode(io->iocb->ki_filp);
- if (is_bad_inode(inode))
- return -EIO;
-
res = fuse_direct_io(io, iter, ppos, 0);
fuse_invalidate_atime(inode);
@@ -1426,10 +1430,21 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
return res;
}
+static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
- return __fuse_direct_read(&io, to, &iocb->ki_pos);
+ ssize_t res;
+
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, to);
+ } else {
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+
+ res = __fuse_direct_read(&io, to, &iocb->ki_pos);
+ }
+
+ return res;
}
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1438,14 +1453,17 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
- if (is_bad_inode(inode))
- return -EIO;
-
/* Don't allow parallel writes to the same file */
inode_lock(inode);
res = generic_write_checks(iocb, from);
- if (res > 0)
- res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ if (res > 0) {
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, from);
+ } else {
+ res = fuse_direct_io(&io, from, &iocb->ki_pos,
+ FUSE_DIO_WRITE);
+ }
+ }
fuse_invalidate_attr(inode);
if (res > 0)
fuse_write_update_size(inode, iocb->ki_pos);
@@ -1454,6 +1472,34 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
return res;
}
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ if (is_bad_inode(file_inode(file)))
+ return -EIO;
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_read_iter(iocb, to);
+ else
+ return fuse_direct_read_iter(iocb, to);
+}
+
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ if (is_bad_inode(file_inode(file)))
+ return -EIO;
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_write_iter(iocb, from);
+ else
+ return fuse_direct_write_iter(iocb, from);
+}
+
static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
{
int i;
@@ -1481,20 +1527,18 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fi->page_waitq);
}
-/* Called under fc->lock, may release and reacquire it */
+/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
loff_t size)
-__releases(fc->lock)
-__acquires(fc->lock)
+__releases(fi->lock)
+__acquires(fi->lock)
{
+ struct fuse_req *aux, *next;
struct fuse_inode *fi = get_fuse_inode(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
__u64 data_size = req->num_pages * PAGE_SIZE;
bool queued;
- if (!fc->connected)
- goto out_free;
-
if (inarg->offset + data_size <= size) {
inarg->size = data_size;
} else if (inarg->offset < size) {
@@ -1505,28 +1549,40 @@ __acquires(fc->lock)
}
req->in.args[1].size = inarg->size;
- fi->writectr++;
queued = fuse_request_queue_background(fc, req);
- WARN_ON(!queued);
+ /* Fails on broken connection only */
+ if (unlikely(!queued))
+ goto out_free;
+
+ fi->writectr++;
return;
out_free:
fuse_writepage_finish(fc, req);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
+
+ /* After fuse_writepage_finish() aux request list is private */
+ for (aux = req->misc.write.next; aux; aux = next) {
+ next = aux->misc.write.next;
+ aux->misc.write.next = NULL;
+ fuse_writepage_free(fc, aux);
+ fuse_put_request(fc, aux);
+ }
+
fuse_writepage_free(fc, req);
fuse_put_request(fc, req);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
}
/*
* If fi->writectr is positive (no truncate or fsync going on) send
* all queued writepage requests.
*
- * Called with fc->lock
+ * Called with fi->lock
*/
void fuse_flush_writepages(struct inode *inode)
-__releases(fc->lock)
-__acquires(fc->lock)
+__releases(fi->lock)
+__acquires(fi->lock)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1546,7 +1602,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
struct fuse_inode *fi = get_fuse_inode(inode);
mapping_set_error(inode->i_mapping, req->out.h.error);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
while (req->misc.write.next) {
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_write_in *inarg = &req->misc.write.in;
@@ -1583,7 +1639,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
}
fi->writectr--;
fuse_writepage_finish(fc, req);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_writepage_free(fc, req);
}
@@ -1592,13 +1648,13 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
{
struct fuse_file *ff = NULL;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (!list_empty(&fi->write_files)) {
ff = list_entry(fi->write_files.next, struct fuse_file,
write_entry);
fuse_file_get(ff);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return ff;
}
@@ -1669,11 +1725,11 @@ static int fuse_writepage_locked(struct page *page)
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add(&req->writepages_entry, &fi->writepages);
list_add_tail(&req->list, &fi->queued_writes);
fuse_flush_writepages(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
end_page_writeback(page);
@@ -1722,21 +1778,27 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
{
struct fuse_req *req = data->req;
struct inode *inode = data->inode;
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
int num_pages = req->num_pages;
int i;
req->ff = fuse_file_get(data->ff);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add_tail(&req->list, &fi->queued_writes);
fuse_flush_writepages(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
for (i = 0; i < num_pages; i++)
end_page_writeback(data->orig_pages[i]);
}
+/*
+ * First recheck under fi->lock if the offending offset is still under
+ * writeback. If yes, then iterate auxiliary write requests, to see if there's
+ * one already added for a page at this offset. If there's none, then insert
+ * this new request onto the auxiliary list, otherwise reuse the existing one by
+ * copying the new page contents over to the old temporary page.
+ */
static bool fuse_writepage_in_flight(struct fuse_req *new_req,
struct page *page)
{
@@ -1744,57 +1806,50 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
struct fuse_inode *fi = get_fuse_inode(new_req->inode);
struct fuse_req *tmp;
struct fuse_req *old_req;
- bool found = false;
- pgoff_t curr_index;
- BUG_ON(new_req->num_pages != 0);
+ WARN_ON(new_req->num_pages != 0);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_del(&new_req->writepages_entry);
- list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
- BUG_ON(old_req->inode != new_req->inode);
- curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT;
- if (curr_index <= page->index &&
- page->index < curr_index + old_req->num_pages) {
- found = true;
- break;
- }
- }
- if (!found) {
+ old_req = fuse_find_writeback(fi, page->index, page->index);
+ if (!old_req) {
list_add(&new_req->writepages_entry, &fi->writepages);
- goto out_unlock;
+ spin_unlock(&fi->lock);
+ return false;
}
new_req->num_pages = 1;
- for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
- BUG_ON(tmp->inode != new_req->inode);
+ for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) {
+ pgoff_t curr_index;
+
+ WARN_ON(tmp->inode != new_req->inode);
curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
- if (tmp->num_pages == 1 &&
- curr_index == page->index) {
- old_req = tmp;
+ if (curr_index == page->index) {
+ WARN_ON(tmp->num_pages != 1);
+ WARN_ON(!test_bit(FR_PENDING, &tmp->flags));
+ swap(tmp->pages[0], new_req->pages[0]);
+ break;
}
}
- if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
- struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
+ if (!tmp) {
+ new_req->misc.write.next = old_req->misc.write.next;
+ old_req->misc.write.next = new_req;
+ }
+
+ spin_unlock(&fi->lock);
- copy_highpage(old_req->pages[0], page);
- spin_unlock(&fc->lock);
+ if (tmp) {
+ struct backing_dev_info *bdi = inode_to_bdi(new_req->inode);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
- goto out;
- } else {
- new_req->misc.write.next = old_req->misc.write.next;
- old_req->misc.write.next = new_req;
}
-out_unlock:
- spin_unlock(&fc->lock);
-out:
- return found;
+
+ return true;
}
static int fuse_writepages_fill(struct page *page,
@@ -1803,6 +1858,7 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_fill_wb_data *data = _data;
struct fuse_req *req = data->req;
struct inode *inode = data->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct page *tmp_page;
bool is_writeback;
@@ -1873,9 +1929,9 @@ static int fuse_writepages_fill(struct page *page,
req->end = fuse_writepage_end;
req->inode = inode;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add(&req->writepages_entry, &fi->writepages);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
data->req = req;
}
@@ -1898,12 +1954,12 @@ static int fuse_writepages_fill(struct page *page,
data->orig_pages[req->num_pages] = page;
/*
- * Protected by fc->lock against concurrent access by
+ * Protected by fi->lock against concurrent access by
* fuse_page_is_writeback().
*/
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
req->num_pages++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
out_unlock:
unlock_page(page);
@@ -2087,6 +2143,18 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_DIRECT_IO) {
+ /* Can't provide the coherency needed for MAP_SHARED */
+ if (vma->vm_flags & VM_MAYSHARE)
+ return -ENODEV;
+
+ invalidate_inode_pages2(file->f_mapping);
+
+ return generic_file_mmap(file, vma);
+ }
+
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
fuse_link_write_file(file);
@@ -2095,17 +2163,6 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
-{
- /* Can't provide the coherency needed for MAP_SHARED */
- if (vma->vm_flags & VM_MAYSHARE)
- return -ENODEV;
-
- invalidate_inode_pages2(file->f_mapping);
-
- return generic_file_mmap(file, vma);
-}
-
static int convert_fuse_file_lock(struct fuse_conn *fc,
const struct fuse_file_lock *ffl,
struct file_lock *fl)
@@ -3114,6 +3171,7 @@ static const struct file_operations fuse_file_operations = {
.lock = fuse_file_lock,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
@@ -3121,24 +3179,6 @@ static const struct file_operations fuse_file_operations = {
.copy_file_range = fuse_copy_file_range,
};
-static const struct file_operations fuse_direct_io_file_operations = {
- .llseek = fuse_file_llseek,
- .read_iter = fuse_direct_read_iter,
- .write_iter = fuse_direct_write_iter,
- .mmap = fuse_direct_mmap,
- .open = fuse_open,
- .flush = fuse_flush,
- .release = fuse_release,
- .fsync = fuse_fsync,
- .lock = fuse_file_lock,
- .flock = fuse_file_flock,
- .unlocked_ioctl = fuse_file_ioctl,
- .compat_ioctl = fuse_file_compat_ioctl,
- .poll = fuse_file_poll,
- .fallocate = fuse_file_fallocate,
- /* no splice_read */
-};
-
static const struct address_space_operations fuse_file_aops = {
.readpage = fuse_readpage,
.writepage = fuse_writepage,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2f2c92e6f8cb..0920c0c032a0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -96,7 +96,7 @@ struct fuse_inode {
union {
/* Write related fields (regular file only) */
struct {
- /* Files usable in writepage. Protected by fc->lock */
+ /* Files usable in writepage. Protected by fi->lock */
struct list_head write_files;
/* Writepages pending on truncate or fsync */
@@ -144,6 +144,9 @@ struct fuse_inode {
/** Lock for serializing lookup and readdir for back compatibility*/
struct mutex mutex;
+
+ /** Lock to protect write related fields */
+ spinlock_t lock;
};
/** FUSE inode state bits */
@@ -163,7 +166,10 @@ struct fuse_file {
/** Fuse connection for this file */
struct fuse_conn *fc;
- /** Request reserved for flush and release */
+ /*
+ * Request reserved for flush and release.
+ * Modified under relative fuse_inode::lock.
+ */
struct fuse_req *reserved_req;
/** Kernel file handle guaranteed to be unique */
@@ -538,7 +544,7 @@ struct fuse_conn {
struct fuse_iqueue iq;
/** The next unique kernel file handle */
- u64 khctr;
+ atomic64_t khctr;
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
@@ -624,6 +630,9 @@ struct fuse_conn {
/** Is open/release not implemented by fs? */
unsigned no_open:1;
+ /** Is opendir/releasedir not implemented by fs? */
+ unsigned no_opendir:1;
+
/** Is fsync not implemented by fs? */
unsigned no_fsync:1;
@@ -730,7 +739,7 @@ struct fuse_conn {
struct fuse_req *destroy_req;
/** Version counter for attribute changes */
- u64 attr_version;
+ atomic64_t attr_version;
/** Called on final put */
void (*release)(struct fuse_conn *);
@@ -770,6 +779,11 @@ static inline int invalid_nodeid(u64 nodeid)
return !nodeid || nodeid == FUSE_ROOT_ID;
}
+static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+ return atomic64_read(&fc->attr_version);
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -817,7 +831,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
-void fuse_sync_release(struct fuse_file *ff, int flags);
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags);
/**
* Send RELEASE or RELEASEDIR request
@@ -936,7 +950,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
/* Abort all requests */
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
+void fuse_abort_conn(struct fuse_conn *fc);
void fuse_wait_aborted(struct fuse_conn *fc);
/**
@@ -1000,8 +1014,6 @@ void fuse_flush_writepages(struct inode *inode);
void fuse_set_nowrite(struct inode *inode);
void fuse_release_nowrite(struct inode *inode);
-u64 fuse_get_attr_version(struct fuse_conn *fc);
-
/**
* File-system tells the kernel to invalidate cache for the given node id.
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c2d4099429be..ec5d9953dfb6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
+ spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
if (!fi->forget) {
kmem_cache_free(fuse_inode_cachep, inode);
@@ -163,7 +164,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- fi->attr_version = ++fc->attr_version;
+ lockdep_assert_held(&fi->lock);
+
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
fi->i_time = attr_valid;
WRITE_ONCE(fi->inval_mask, 0);
@@ -209,10 +212,10 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
loff_t oldsize;
struct timespec64 old_mtime;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return;
}
@@ -227,7 +230,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
*/
if (!is_wb || !S_ISREG(inode->i_mode))
i_size_write(inode, attr->size);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
if (!is_wb && S_ISREG(inode->i_mode)) {
bool inval = false;
@@ -322,9 +325,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
}
fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_change_attributes(inode, attr, attr_valid, attr_version);
return inode;
@@ -376,7 +379,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
static void fuse_umount_begin(struct super_block *sb)
{
- fuse_abort_conn(get_fuse_conn_super(sb), false);
+ fuse_abort_conn(get_fuse_conn_super(sb));
}
static void fuse_send_destroy(struct fuse_conn *fc)
@@ -619,12 +622,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
atomic_set(&fc->num_waiting, 0);
fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
- fc->khctr = 0;
+ atomic64_set(&fc->khctr, 0);
fc->polled_files = RB_ROOT;
fc->blocked = 0;
fc->initialized = 0;
fc->connected = 1;
- fc->attr_version = 1;
+ atomic64_set(&fc->attr_version, 1);
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
@@ -969,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
- FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS;
+ FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
+ FUSE_NO_OPENDIR_SUPPORT;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1010,7 +1014,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does it's own writeback accounting */
sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
@@ -1242,7 +1246,7 @@ static void fuse_sb_destroy(struct super_block *sb)
if (fc) {
fuse_send_destroy(fc);
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
fuse_wait_aborted(fc);
down_write(&fc->killsb);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index ab18b78f4755..574d03f8a573 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -213,9 +213,9 @@ retry:
}
fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
forget_all_cached_acls(inode);
fuse_change_attributes(inode, &o->attr,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a2dea5bc0427..58a768e59712 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
@@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b92740edc416..d32964cd1117 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode,
static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
{
- u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
+ u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0);
return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
}
@@ -2131,71 +2131,29 @@ static const struct file_operations gfs2_sbstats_fops = {
.release = seq_release,
};
-int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
-{
- struct dentry *dent;
-
- dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dir = dent;
-
- dent = debugfs_create_file("glocks",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glocks_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_glocks = dent;
-
- dent = debugfs_create_file("glstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glstats_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_glstats = dent;
-
- dent = debugfs_create_file("sbstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_sbstats_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_sbstats = dent;
+void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+{
+ sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
- return 0;
-fail:
- gfs2_delete_debugfs_file(sdp);
- return dent ? PTR_ERR(dent) : -ENOMEM;
+ debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glocks_fops);
+
+ debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glstats_fops);
+
+ debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_sbstats_fops);
}
void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
{
- if (sdp->debugfs_dir) {
- if (sdp->debugfs_dentry_glocks) {
- debugfs_remove(sdp->debugfs_dentry_glocks);
- sdp->debugfs_dentry_glocks = NULL;
- }
- if (sdp->debugfs_dentry_glstats) {
- debugfs_remove(sdp->debugfs_dentry_glstats);
- sdp->debugfs_dentry_glstats = NULL;
- }
- if (sdp->debugfs_dentry_sbstats) {
- debugfs_remove(sdp->debugfs_dentry_sbstats);
- sdp->debugfs_dentry_sbstats = NULL;
- }
- debugfs_remove(sdp->debugfs_dir);
- sdp->debugfs_dir = NULL;
- }
+ debugfs_remove_recursive(sdp->debugfs_dir);
+ sdp->debugfs_dir = NULL;
}
-int gfs2_register_debugfs(void)
+void gfs2_register_debugfs(void)
{
gfs2_root = debugfs_create_dir("gfs2", NULL);
- if (IS_ERR(gfs2_root))
- return PTR_ERR(gfs2_root);
- return gfs2_root ? 0 : -ENOMEM;
}
void gfs2_unregister_debugfs(void)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 8949bf28b249..936b3295839c 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -243,9 +243,9 @@ extern void gfs2_glock_free(struct gfs2_glock *gl);
extern int __init gfs2_glock_init(void);
extern void gfs2_glock_exit(void);
-extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-extern int gfs2_register_debugfs(void);
+extern void gfs2_register_debugfs(void);
extern void gfs2_unregister_debugfs(void);
extern const struct lm_lockops gfs2_dlm_ops;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e10e0b0a7cd5..cdf07b408f54 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -853,9 +853,6 @@ struct gfs2_sbd {
unsigned long sd_last_warning;
struct dentry *debugfs_dir; /* debugfs directory */
- struct dentry *debugfs_dentry_glocks;
- struct dentry *debugfs_dentry_glstats;
- struct dentry *debugfs_dentry_sbstats;
};
static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 793808263c6d..18d4af7417fa 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -59,8 +59,8 @@ static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
- change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+ change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT;
+ gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 2295042bc625..8722c60b11fe 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -168,7 +168,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
* that is pinned in the pagecache.
*/
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+ struct bio_vec *bvec,
blk_status_t error)
{
struct buffer_head *bh, *next;
@@ -207,6 +208,7 @@ static void gfs2_end_log_write(struct bio *bio)
struct bio_vec *bvec;
struct page *page;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status) {
fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@@ -214,7 +216,7 @@ static void gfs2_end_log_write(struct bio *bio)
wake_up(&sdp->sd_logd_waitq);
}
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c7603063f861..136484ef35d3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -178,16 +178,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_page_pool)
goto fail_mempool;
- error = gfs2_register_debugfs();
- if (error)
- goto fail_debugfs;
+ gfs2_register_debugfs();
pr_info("GFS2 installed\n");
return 0;
-fail_debugfs:
- mempool_destroy(gfs2_page_pool);
fail_mempool:
destroy_workqueue(gfs2_freeze_wq);
fail_wq3:
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index be9c0bf697fe..3201342404a7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct buffer_head *bh = page_buffers(page);
unsigned int len = bvec->bv_len;
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index 823a328791c0..302f45101a96 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -120,11 +120,11 @@ struct hpfs_spare_block
u8 bad_sector: 1; /* bad sector, corrupted disk (???) */
u8 bad_bitmap: 1; /* bad bitmap */
u8 fast: 1; /* partition was fast formatted */
- u8 old_wrote: 1; /* old version wrote to partion */
- u8 old_wrote_1: 1; /* old version wrote to partion (?) */
+ u8 old_wrote: 1; /* old version wrote to partition */
+ u8 old_wrote_1: 1; /* old version wrote to partition (?) */
#else
- u8 old_wrote_1: 1; /* old version wrote to partion (?) */
- u8 old_wrote: 1; /* old version wrote to partion */
+ u8 old_wrote_1: 1; /* old version wrote to partition (?) */
+ u8 old_wrote: 1; /* old version wrote to partition */
u8 fast: 1; /* partition was fast formatted */
u8 bad_bitmap: 1; /* bad bitmap */
u8 bad_sector: 1; /* bad sector, corrupted disk (???) */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 32920a10100e..ec32fece5e1e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -27,7 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
@@ -45,11 +45,17 @@ const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
-struct hugetlbfs_config {
+enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+struct hugetlbfs_fs_context {
struct hstate *hstate;
+ unsigned long long max_size_opt;
+ unsigned long long min_size_opt;
long max_hpages;
long nr_inodes;
long min_hpages;
+ enum hugetlbfs_size_type max_val_type;
+ enum hugetlbfs_size_type min_val_type;
kuid_t uid;
kgid_t gid;
umode_t mode;
@@ -57,22 +63,30 @@ struct hugetlbfs_config {
int sysctl_hugetlb_shm_group;
-enum {
- Opt_size, Opt_nr_inodes,
- Opt_mode, Opt_uid, Opt_gid,
- Opt_pagesize, Opt_min_size,
- Opt_err,
+enum hugetlb_param {
+ Opt_gid,
+ Opt_min_size,
+ Opt_mode,
+ Opt_nr_inodes,
+ Opt_pagesize,
+ Opt_size,
+ Opt_uid,
};
-static const match_table_t tokens = {
- {Opt_size, "size=%s"},
- {Opt_nr_inodes, "nr_inodes=%s"},
- {Opt_mode, "mode=%o"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_pagesize, "pagesize=%s"},
- {Opt_min_size, "min_size=%s"},
- {Opt_err, NULL},
+static const struct fs_parameter_spec hugetlb_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_string("min_size", Opt_min_size),
+ fsparam_u32 ("mode", Opt_mode),
+ fsparam_string("nr_inodes", Opt_nr_inodes),
+ fsparam_string("pagesize", Opt_pagesize),
+ fsparam_string("size", Opt_size),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+static const struct fs_parameter_description hugetlb_fs_parameters = {
+ .name = "hugetlbfs",
+ .specs = hugetlb_param_specs,
};
#ifdef CONFIG_NUMA
@@ -530,7 +544,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
}
@@ -708,16 +722,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
}
static struct inode *hugetlbfs_get_root(struct super_block *sb,
- struct hugetlbfs_config *config)
+ struct hugetlbfs_fs_context *ctx)
{
struct inode *inode;
inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_mode = S_IFDIR | config->mode;
- inode->i_uid = config->uid;
- inode->i_gid = config->gid;
+ inode->i_mode = S_IFDIR | ctx->mode;
+ inode->i_uid = ctx->uid;
+ inode->i_gid = ctx->gid;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -859,6 +873,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
+
+ /*
+ * page_private is subpool pointer in hugetlb pages. Transfer to
+ * new page. PagePrivate is not associated with page_private for
+ * hugetlb pages and can not be set here as only page_huge_active
+ * pages can be migrated.
+ */
+ if (page_private(page)) {
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ }
+
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
@@ -1081,8 +1107,6 @@ static const struct super_operations hugetlbfs_ops = {
.show_options = hugetlbfs_show_options,
};
-enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
-
/*
* Convert size option passed from command line to number of huge pages
* in the pool specified by hstate. Size option could be in bytes
@@ -1105,170 +1129,151 @@ hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
return size_opt;
}
-static int
-hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
+/*
+ * Parse one mount parameter.
+ */
+static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p, *rest;
- substring_t args[MAX_OPT_ARGS];
- int option;
- unsigned long long max_size_opt = 0, min_size_opt = 0;
- enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE;
-
- if (!options)
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ char *rest;
+ unsigned long ps;
+ int opt;
+
+ opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(ctx->uid))
+ goto bad_val;
return 0;
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ case Opt_gid:
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(ctx->gid))
+ goto bad_val;
+ return 0;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- goto bad_val;
- pconfig->uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(pconfig->uid))
- goto bad_val;
- break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 01777U;
+ return 0;
- case Opt_gid:
- if (match_int(&args[0], &option))
- goto bad_val;
- pconfig->gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(pconfig->gid))
- goto bad_val;
- break;
+ case Opt_size:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->max_size_opt = memparse(param->string, &rest);
+ ctx->max_val_type = SIZE_STD;
+ if (*rest == '%')
+ ctx->max_val_type = SIZE_PERCENT;
+ return 0;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- goto bad_val;
- pconfig->mode = option & 01777U;
- break;
+ case Opt_nr_inodes:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->nr_inodes = memparse(param->string, &rest);
+ return 0;
- case Opt_size: {
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- max_size_opt = memparse(args[0].from, &rest);
- max_val_type = SIZE_STD;
- if (*rest == '%')
- max_val_type = SIZE_PERCENT;
- break;
+ case Opt_pagesize:
+ ps = memparse(param->string, &rest);
+ ctx->hstate = size_to_hstate(ps);
+ if (!ctx->hstate) {
+ pr_err("Unsupported page size %lu MB\n", ps >> 20);
+ return -EINVAL;
}
+ return 0;
- case Opt_nr_inodes:
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- pconfig->nr_inodes = memparse(args[0].from, &rest);
- break;
+ case Opt_min_size:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->min_size_opt = memparse(param->string, &rest);
+ ctx->min_val_type = SIZE_STD;
+ if (*rest == '%')
+ ctx->min_val_type = SIZE_PERCENT;
+ return 0;
- case Opt_pagesize: {
- unsigned long ps;
- ps = memparse(args[0].from, &rest);
- pconfig->hstate = size_to_hstate(ps);
- if (!pconfig->hstate) {
- pr_err("Unsupported page size %lu MB\n",
- ps >> 20);
- return -EINVAL;
- }
- break;
- }
+ default:
+ return -EINVAL;
+ }
- case Opt_min_size: {
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- min_size_opt = memparse(args[0].from, &rest);
- min_val_type = SIZE_STD;
- if (*rest == '%')
- min_val_type = SIZE_PERCENT;
- break;
- }
+bad_val:
+ return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n",
+ param->string, param->key);
+}
- default:
- pr_err("Bad mount option: \"%s\"\n", p);
- return -EINVAL;
- break;
- }
- }
+/*
+ * Validate the parsed options.
+ */
+static int hugetlbfs_validate(struct fs_context *fc)
+{
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
/*
* Use huge page pool size (in hstate) to convert the size
* options to number of huge pages. If NO_SIZE, -1 is returned.
*/
- pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
- max_size_opt, max_val_type);
- pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
- min_size_opt, min_val_type);
+ ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+ ctx->max_size_opt,
+ ctx->max_val_type);
+ ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+ ctx->min_size_opt,
+ ctx->min_val_type);
/*
* If max_size was specified, then min_size must be smaller
*/
- if (max_val_type > NO_SIZE &&
- pconfig->min_hpages > pconfig->max_hpages) {
- pr_err("minimum size can not be greater than maximum size\n");
+ if (ctx->max_val_type > NO_SIZE &&
+ ctx->min_hpages > ctx->max_hpages) {
+ pr_err("Minimum size can not be greater than maximum size\n");
return -EINVAL;
}
return 0;
-
-bad_val:
- pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
- return -EINVAL;
}
static int
-hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
+hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
- int ret;
- struct hugetlbfs_config config;
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
struct hugetlbfs_sb_info *sbinfo;
- config.max_hpages = -1; /* No limit on size by default */
- config.nr_inodes = -1; /* No limit on number of inodes by default */
- config.uid = current_fsuid();
- config.gid = current_fsgid();
- config.mode = 0755;
- config.hstate = &default_hstate;
- config.min_hpages = -1; /* No default minimum size */
- ret = hugetlbfs_parse_options(data, &config);
- if (ret)
- return ret;
-
sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
if (!sbinfo)
return -ENOMEM;
sb->s_fs_info = sbinfo;
- sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_inodes = config.nr_inodes;
- sbinfo->free_inodes = config.nr_inodes;
- sbinfo->spool = NULL;
- sbinfo->uid = config.uid;
- sbinfo->gid = config.gid;
- sbinfo->mode = config.mode;
+ sbinfo->hstate = ctx->hstate;
+ sbinfo->max_inodes = ctx->nr_inodes;
+ sbinfo->free_inodes = ctx->nr_inodes;
+ sbinfo->spool = NULL;
+ sbinfo->uid = ctx->uid;
+ sbinfo->gid = ctx->gid;
+ sbinfo->mode = ctx->mode;
/*
* Allocate and initialize subpool if maximum or minimum size is
* specified. Any needed reservations (for minimim size) are taken
* taken when the subpool is created.
*/
- if (config.max_hpages != -1 || config.min_hpages != -1) {
- sbinfo->spool = hugepage_new_subpool(config.hstate,
- config.max_hpages,
- config.min_hpages);
+ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
+ sbinfo->spool = hugepage_new_subpool(ctx->hstate,
+ ctx->max_hpages,
+ ctx->min_hpages);
if (!sbinfo->spool)
goto out_free;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = huge_page_size(config.hstate);
- sb->s_blocksize_bits = huge_page_shift(config.hstate);
+ sb->s_blocksize = huge_page_size(ctx->hstate);
+ sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
- sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
+ sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
if (!sb->s_root)
goto out_free;
return 0;
@@ -1278,16 +1283,52 @@ out_free:
return -ENOMEM;
}
-static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_tree(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+ int err = hugetlbfs_validate(fc);
+ if (err)
+ return err;
+ return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super);
+}
+
+static void hugetlbfs_fs_context_free(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations hugetlbfs_fs_context_ops = {
+ .free = hugetlbfs_fs_context_free,
+ .parse_param = hugetlbfs_parse_param,
+ .get_tree = hugetlbfs_get_tree,
+};
+
+static int hugetlbfs_init_fs_context(struct fs_context *fc)
+{
+ struct hugetlbfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->max_hpages = -1; /* No limit on size by default */
+ ctx->nr_inodes = -1; /* No limit on number of inodes by default */
+ ctx->uid = current_fsuid();
+ ctx->gid = current_fsgid();
+ ctx->mode = 0755;
+ ctx->hstate = &default_hstate;
+ ctx->min_hpages = -1; /* No default minimum size */
+ ctx->max_val_type = NO_SIZE;
+ ctx->min_val_type = NO_SIZE;
+ fc->fs_private = ctx;
+ fc->ops = &hugetlbfs_fs_context_ops;
+ return 0;
}
static struct file_system_type hugetlbfs_fs_type = {
- .name = "hugetlbfs",
- .mount = hugetlbfs_mount,
- .kill_sb = kill_litter_super,
+ .name = "hugetlbfs",
+ .init_fs_context = hugetlbfs_init_fs_context,
+ .parameters = &hugetlb_fs_parameters,
+ .kill_sb = kill_litter_super,
};
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1372,8 +1413,29 @@ out:
return file;
}
+static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+
+ fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
+ if (IS_ERR(fc)) {
+ mnt = ERR_CAST(fc);
+ } else {
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
+ ctx->hstate = h;
+ mnt = fc_mount(fc);
+ put_fs_context(fc);
+ }
+ if (IS_ERR(mnt))
+ pr_err("Cannot mount internal hugetlbfs for page size %uK",
+ 1U << (h->order + PAGE_SHIFT - 10));
+ return mnt;
+}
+
static int __init init_hugetlbfs_fs(void)
{
+ struct vfsmount *mnt;
struct hstate *h;
int error;
int i;
@@ -1396,24 +1458,16 @@ static int __init init_hugetlbfs_fs(void)
i = 0;
for_each_hstate(h) {
- char buf[50];
- unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
-
- snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
- hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
- buf);
-
- if (IS_ERR(hugetlbfs_vfsmount[i])) {
- pr_err("Cannot mount internal hugetlbfs for "
- "page size %uK", ps_kb);
- error = PTR_ERR(hugetlbfs_vfsmount[i]);
- hugetlbfs_vfsmount[i] = NULL;
+ mnt = mount_one_hugetlbfs(h);
+ if (IS_ERR(mnt) && i == 0) {
+ error = PTR_ERR(mnt);
+ goto out;
}
+ hugetlbfs_vfsmount[i] = mnt;
i++;
}
- /* Non default hstates are optional */
- if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
- return 0;
+
+ return 0;
out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
diff --git a/fs/inode.c b/fs/inode.c
index 73432e64f874..e9d97add2b36 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait);
void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask)
{
- unsigned int old_flags, new_flags;
-
WARN_ON_ONCE(flags & ~mask);
- do {
- old_flags = READ_ONCE(inode->i_flags);
- new_flags = (old_flags & ~mask) | flags;
- } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
- new_flags) != old_flags));
+ set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);
diff --git a/fs/internal.h b/fs/internal.h
index d410186bc369..6a8b71643af4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -17,6 +17,7 @@ struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
+struct fs_context;
/*
* block_dev.c
@@ -52,8 +53,16 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
extern void __init chrdev_init(void);
/*
+ * fs_context.c
+ */
+extern int parse_monolithic_mount_data(struct fs_context *, void *);
+extern void fc_drop_locked(struct fs_context *);
+
+/*
* namei.c
*/
+extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
@@ -99,10 +108,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
/*
* super.c
*/
-extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int reconfigure_super(struct fs_context *);
extern bool trylock_super(struct super_block *sb);
-extern struct dentry *mount_fs(struct file_system_type *,
- int, const char *, void *);
extern struct super_block *user_get_super(dev_t);
/*
diff --git a/fs/io_uring.c b/fs/io_uring.c
new file mode 100644
index 000000000000..c88088d92613
--- /dev/null
+++ b/fs/io_uring.c
@@ -0,0 +1,2971 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared application/kernel submission and completion ring pairs, for
+ * supporting fast/efficient IO.
+ *
+ * A note on the read/write ordering memory barriers that are matched between
+ * the application and kernel side. When the application reads the CQ ring
+ * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
+ * the kernel uses after writing the tail. Failure to do so could cause a
+ * delay in when the application notices that completion events available.
+ * This isn't a fatal condition. Likewise, the application must use an
+ * appropriate smp_wmb() both before writing the SQ tail, and after writing
+ * the SQ tail. The first one orders the sqe writes with the tail write, and
+ * the latter is paired with the smp_rmb() the kernel will issue before
+ * reading the SQ tail on submission.
+ *
+ * Also see the examples in the liburing library:
+ *
+ * git://git.kernel.dk/liburing
+ *
+ * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
+ * from data shared between the kernel and application. This is done both
+ * for ordering purposes, but also to ensure that once a value is loaded from
+ * data that the application could potentially modify, it remains stable.
+ *
+ * Copyright (C) 2018-2019 Jens Axboe
+ * Copyright (c) 2018-2019 Christoph Hellwig
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/refcount.h>
+#include <linux/uio.h>
+
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_context.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+#include <linux/bvec.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+#include <linux/sizes.h>
+#include <linux/hugetlb.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "internal.h"
+
+#define IORING_MAX_ENTRIES 4096
+#define IORING_MAX_FIXED_FILES 1024
+
+struct io_uring {
+ u32 head ____cacheline_aligned_in_smp;
+ u32 tail ____cacheline_aligned_in_smp;
+};
+
+struct io_sq_ring {
+ struct io_uring r;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 dropped;
+ u32 flags;
+ u32 array[];
+};
+
+struct io_cq_ring {
+ struct io_uring r;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 overflow;
+ struct io_uring_cqe cqes[];
+};
+
+struct io_mapped_ubuf {
+ u64 ubuf;
+ size_t len;
+ struct bio_vec *bvec;
+ unsigned int nr_bvecs;
+};
+
+struct async_list {
+ spinlock_t lock;
+ atomic_t cnt;
+ struct list_head list;
+
+ struct file *file;
+ off_t io_end;
+ size_t io_pages;
+};
+
+struct io_ring_ctx {
+ struct {
+ struct percpu_ref refs;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ unsigned int flags;
+ bool compat;
+ bool account_mem;
+
+ /* SQ ring */
+ struct io_sq_ring *sq_ring;
+ unsigned cached_sq_head;
+ unsigned sq_entries;
+ unsigned sq_mask;
+ unsigned sq_thread_idle;
+ struct io_uring_sqe *sq_sqes;
+ } ____cacheline_aligned_in_smp;
+
+ /* IO offload */
+ struct workqueue_struct *sqo_wq;
+ struct task_struct *sqo_thread; /* if using sq thread polling */
+ struct mm_struct *sqo_mm;
+ wait_queue_head_t sqo_wait;
+ unsigned sqo_stop;
+
+ struct {
+ /* CQ ring */
+ struct io_cq_ring *cq_ring;
+ unsigned cached_cq_tail;
+ unsigned cq_entries;
+ unsigned cq_mask;
+ struct wait_queue_head cq_wait;
+ struct fasync_struct *cq_fasync;
+ } ____cacheline_aligned_in_smp;
+
+ /*
+ * If used, fixed file set. Writers must ensure that ->refs is dead,
+ * readers must ensure that ->refs is alive as long as the file* is
+ * used. Only updated through io_uring_register(2).
+ */
+ struct file **user_files;
+ unsigned nr_user_files;
+
+ /* if used, fixed mapped user buffers */
+ unsigned nr_user_bufs;
+ struct io_mapped_ubuf *user_bufs;
+
+ struct user_struct *user;
+
+ struct completion ctx_done;
+
+ struct {
+ struct mutex uring_lock;
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t completion_lock;
+ bool poll_multi_file;
+ /*
+ * ->poll_list is protected by the ctx->uring_lock for
+ * io_uring instances that don't use IORING_SETUP_SQPOLL.
+ * For SQPOLL, only the single threaded io_sq_thread() will
+ * manipulate the list, hence no extra locking is needed there.
+ */
+ struct list_head poll_list;
+ struct list_head cancel_list;
+ } ____cacheline_aligned_in_smp;
+
+ struct async_list pending_async[2];
+
+#if defined(CONFIG_UNIX)
+ struct socket *ring_sock;
+#endif
+};
+
+struct sqe_submit {
+ const struct io_uring_sqe *sqe;
+ unsigned short index;
+ bool has_user;
+ bool needs_lock;
+ bool needs_fixed_file;
+};
+
+struct io_poll_iocb {
+ struct file *file;
+ struct wait_queue_head *head;
+ __poll_t events;
+ bool woken;
+ bool canceled;
+ struct wait_queue_entry wait;
+};
+
+struct io_kiocb {
+ union {
+ struct kiocb rw;
+ struct io_poll_iocb poll;
+ };
+
+ struct sqe_submit submit;
+
+ struct io_ring_ctx *ctx;
+ struct list_head list;
+ unsigned int flags;
+ refcount_t refs;
+#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
+#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
+#define REQ_F_FIXED_FILE 4 /* ctx owns file */
+#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+ u64 user_data;
+ u64 error;
+
+ struct work_struct work;
+};
+
+#define IO_PLUG_THRESHOLD 2
+#define IO_IOPOLL_BATCH 8
+
+struct io_submit_state {
+ struct blk_plug plug;
+
+ /*
+ * io_kiocb alloc cache
+ */
+ void *reqs[IO_IOPOLL_BATCH];
+ unsigned int free_reqs;
+ unsigned int cur_req;
+
+ /*
+ * File reference cache
+ */
+ struct file *file;
+ unsigned int fd;
+ unsigned int has_refs;
+ unsigned int used_refs;
+ unsigned int ios_left;
+};
+
+static struct kmem_cache *req_cachep;
+
+static const struct file_operations io_uring_fops;
+
+struct sock *io_uring_get_socket(struct file *file)
+{
+#if defined(CONFIG_UNIX)
+ if (file->f_op == &io_uring_fops) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ return ctx->ring_sock->sk;
+ }
+#endif
+ return NULL;
+}
+EXPORT_SYMBOL(io_uring_get_socket);
+
+static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+{
+ struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
+
+ complete(&ctx->ctx_done);
+}
+
+static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+{
+ struct io_ring_ctx *ctx;
+ int i;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
+ kfree(ctx);
+ return NULL;
+ }
+
+ ctx->flags = p->flags;
+ init_waitqueue_head(&ctx->cq_wait);
+ init_completion(&ctx->ctx_done);
+ mutex_init(&ctx->uring_lock);
+ init_waitqueue_head(&ctx->wait);
+ for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
+ spin_lock_init(&ctx->pending_async[i].lock);
+ INIT_LIST_HEAD(&ctx->pending_async[i].list);
+ atomic_set(&ctx->pending_async[i].cnt, 0);
+ }
+ spin_lock_init(&ctx->completion_lock);
+ INIT_LIST_HEAD(&ctx->poll_list);
+ INIT_LIST_HEAD(&ctx->cancel_list);
+ return ctx;
+}
+
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+
+ if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
+ /* order cqe stores with ring update */
+ smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
+
+ /*
+ * Write sider barrier of tail update, app has read side. See
+ * comment at the top of this file.
+ */
+ smp_wmb();
+
+ if (wq_has_sleeper(&ctx->cq_wait)) {
+ wake_up_interruptible(&ctx->cq_wait);
+ kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+ }
+ }
+}
+
+static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+ unsigned tail;
+
+ tail = ctx->cached_cq_tail;
+ /* See comment at the top of the file */
+ smp_rmb();
+ if (tail + 1 == READ_ONCE(ring->r.head))
+ return NULL;
+
+ ctx->cached_cq_tail++;
+ return &ring->cqes[tail & ctx->cq_mask];
+}
+
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+ long res, unsigned ev_flags)
+{
+ struct io_uring_cqe *cqe;
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqring(ctx);
+ if (cqe) {
+ WRITE_ONCE(cqe->user_data, ki_user_data);
+ WRITE_ONCE(cqe->res, res);
+ WRITE_ONCE(cqe->flags, ev_flags);
+ } else {
+ unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
+
+ WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
+ }
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+ long res, unsigned ev_flags)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
+}
+
+static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
+{
+ percpu_ref_put_many(&ctx->refs, refs);
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+}
+
+static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
+ struct io_submit_state *state)
+{
+ struct io_kiocb *req;
+
+ if (!percpu_ref_tryget(&ctx->refs))
+ return NULL;
+
+ if (!state) {
+ req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+ if (unlikely(!req))
+ goto out;
+ } else if (!state->free_reqs) {
+ size_t sz;
+ int ret;
+
+ sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
+ ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
+ state->reqs);
+ if (unlikely(ret <= 0))
+ goto out;
+ state->free_reqs = ret - 1;
+ state->cur_req = 1;
+ req = state->reqs[0];
+ } else {
+ req = state->reqs[state->cur_req];
+ state->free_reqs--;
+ state->cur_req++;
+ }
+
+ req->ctx = ctx;
+ req->flags = 0;
+ refcount_set(&req->refs, 0);
+ return req;
+out:
+ io_ring_drop_ctx_refs(ctx, 1);
+ return NULL;
+}
+
+static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+{
+ if (*nr) {
+ kmem_cache_free_bulk(req_cachep, *nr, reqs);
+ io_ring_drop_ctx_refs(ctx, *nr);
+ *nr = 0;
+ }
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+ if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
+ io_ring_drop_ctx_refs(req->ctx, 1);
+ kmem_cache_free(req_cachep, req);
+ }
+}
+
+/*
+ * Find and free completed poll iocbs
+ */
+static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ struct list_head *done)
+{
+ void *reqs[IO_IOPOLL_BATCH];
+ int file_count, to_free;
+ struct file *file = NULL;
+ struct io_kiocb *req;
+
+ file_count = to_free = 0;
+ while (!list_empty(done)) {
+ req = list_first_entry(done, struct io_kiocb, list);
+ list_del(&req->list);
+
+ io_cqring_fill_event(ctx, req->user_data, req->error, 0);
+
+ reqs[to_free++] = req;
+ (*nr_events)++;
+
+ /*
+ * Batched puts of the same file, to avoid dirtying the
+ * file usage count multiple times, if avoidable.
+ */
+ if (!(req->flags & REQ_F_FIXED_FILE)) {
+ if (!file) {
+ file = req->rw.ki_filp;
+ file_count = 1;
+ } else if (file == req->rw.ki_filp) {
+ file_count++;
+ } else {
+ fput_many(file, file_count);
+ file = req->rw.ki_filp;
+ file_count = 1;
+ }
+ }
+
+ if (to_free == ARRAY_SIZE(reqs))
+ io_free_req_many(ctx, reqs, &to_free);
+ }
+ io_commit_cqring(ctx);
+
+ if (file)
+ fput_many(file, file_count);
+ io_free_req_many(ctx, reqs, &to_free);
+}
+
+static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ long min)
+{
+ struct io_kiocb *req, *tmp;
+ LIST_HEAD(done);
+ bool spin;
+ int ret;
+
+ /*
+ * Only spin for completions if we don't have multiple devices hanging
+ * off our complete list, and we're under the requested amount.
+ */
+ spin = !ctx->poll_multi_file && *nr_events < min;
+
+ ret = 0;
+ list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+ struct kiocb *kiocb = &req->rw;
+
+ /*
+ * Move completed entries to our local list. If we find a
+ * request that requires polling, break out and complete
+ * the done list first, if we have entries there.
+ */
+ if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+ list_move_tail(&req->list, &done);
+ continue;
+ }
+ if (!list_empty(&done))
+ break;
+
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ if (ret < 0)
+ break;
+
+ if (ret && spin)
+ spin = false;
+ ret = 0;
+ }
+
+ if (!list_empty(&done))
+ io_iopoll_complete(ctx, nr_events, &done);
+
+ return ret;
+}
+
+/*
+ * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
+ * non-spinning poll check - we'll still enter the driver poll loop, but only
+ * as a non-spinning completion check.
+ */
+static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ long min)
+{
+ while (!list_empty(&ctx->poll_list)) {
+ int ret;
+
+ ret = io_do_iopoll(ctx, nr_events, min);
+ if (ret < 0)
+ return ret;
+ if (!min || *nr_events >= min)
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * We can't just wait for polled events to come to us, we have to actively
+ * find and complete them.
+ */
+static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+{
+ if (!(ctx->flags & IORING_SETUP_IOPOLL))
+ return;
+
+ mutex_lock(&ctx->uring_lock);
+ while (!list_empty(&ctx->poll_list)) {
+ unsigned int nr_events = 0;
+
+ io_iopoll_getevents(ctx, &nr_events, 1);
+ }
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
+{
+ int ret = 0;
+
+ do {
+ int tmin = 0;
+
+ if (*nr_events < min)
+ tmin = min - *nr_events;
+
+ ret = io_iopoll_getevents(ctx, nr_events, tmin);
+ if (ret <= 0)
+ break;
+ ret = 0;
+ } while (min && !*nr_events && !need_resched());
+
+ return ret;
+}
+
+static void kiocb_end_write(struct kiocb *kiocb)
+{
+ if (kiocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(kiocb->ki_filp);
+
+ /*
+ * Tell lockdep we inherited freeze protection from submission
+ * thread.
+ */
+ if (S_ISREG(inode->i_mode))
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ file_end_write(kiocb->ki_filp);
+ }
+}
+
+static void io_fput(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ fput(req->rw.ki_filp);
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ kiocb_end_write(kiocb);
+
+ io_fput(req);
+ io_cqring_add_event(req->ctx, req->user_data, res, 0);
+ io_free_req(req);
+}
+
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ kiocb_end_write(kiocb);
+
+ req->error = res;
+ if (res != -EAGAIN)
+ req->flags |= REQ_F_IOPOLL_COMPLETED;
+}
+
+/*
+ * After the iocb has been issued, it's safe to be found on the poll list.
+ * Adding the kiocb to the list AFTER submission ensures that we don't
+ * find it from a io_iopoll_getevents() thread before the issuer is done
+ * accessing the kiocb cookie.
+ */
+static void io_iopoll_req_issued(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ /*
+ * Track whether we have multiple files in our lists. This will impact
+ * how we do polling eventually, not spinning if we're on potentially
+ * different devices.
+ */
+ if (list_empty(&ctx->poll_list)) {
+ ctx->poll_multi_file = false;
+ } else if (!ctx->poll_multi_file) {
+ struct io_kiocb *list_req;
+
+ list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
+ list);
+ if (list_req->rw.ki_filp != req->rw.ki_filp)
+ ctx->poll_multi_file = true;
+ }
+
+ /*
+ * For fast devices, IO may have already completed. If it has, add
+ * it to the front so we find it first.
+ */
+ if (req->flags & REQ_F_IOPOLL_COMPLETED)
+ list_add(&req->list, &ctx->poll_list);
+ else
+ list_add_tail(&req->list, &ctx->poll_list);
+}
+
+static void io_file_put(struct io_submit_state *state, struct file *file)
+{
+ if (!state) {
+ fput(file);
+ } else if (state->file) {
+ int diff = state->has_refs - state->used_refs;
+
+ if (diff)
+ fput_many(state->file, diff);
+ state->file = NULL;
+ }
+}
+
+/*
+ * Get as many references to a file as we have IOs left in this submission,
+ * assuming most submissions are for one file, or at least that each file
+ * has more than one submission.
+ */
+static struct file *io_file_get(struct io_submit_state *state, int fd)
+{
+ if (!state)
+ return fget(fd);
+
+ if (state->file) {
+ if (state->fd == fd) {
+ state->used_refs++;
+ state->ios_left--;
+ return state->file;
+ }
+ io_file_put(state, NULL);
+ }
+ state->file = fget_many(fd, state->ios_left);
+ if (!state->file)
+ return NULL;
+
+ state->fd = fd;
+ state->has_refs = state->ios_left;
+ state->used_refs = 1;
+ state->ios_left--;
+ return state->file;
+}
+
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static bool io_file_supports_async(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+
+ if (S_ISBLK(mode) || S_ISCHR(mode))
+ return true;
+ if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+ return true;
+
+ return false;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ const struct io_uring_sqe *sqe = s->sqe;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct kiocb *kiocb = &req->rw;
+ unsigned ioprio, flags;
+ int fd, ret;
+
+ /* For -EAGAIN retry, everything is already prepped */
+ if (kiocb->ki_filp)
+ return 0;
+
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files ||
+ (unsigned) fd >= ctx->nr_user_files))
+ return -EBADF;
+ kiocb->ki_filp = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ if (s->needs_fixed_file)
+ return -EBADF;
+ kiocb->ki_filp = io_file_get(state, fd);
+ if (unlikely(!kiocb->ki_filp))
+ return -EBADF;
+ if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+ force_nonblock = false;
+ }
+ kiocb->ki_pos = READ_ONCE(sqe->off);
+ kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+
+ ioprio = READ_ONCE(sqe->ioprio);
+ if (ioprio) {
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ goto out_fput;
+
+ kiocb->ki_ioprio = ioprio;
+ } else
+ kiocb->ki_ioprio = get_current_ioprio();
+
+ ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+ if (unlikely(ret))
+ goto out_fput;
+ if (force_nonblock) {
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ req->flags |= REQ_F_FORCE_NONBLOCK;
+ }
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ ret = -EOPNOTSUPP;
+ if (!(kiocb->ki_flags & IOCB_DIRECT) ||
+ !kiocb->ki_filp->f_op->iopoll)
+ goto out_fput;
+
+ req->error = 0;
+ kiocb->ki_flags |= IOCB_HIPRI;
+ kiocb->ki_complete = io_complete_rw_iopoll;
+ } else {
+ if (kiocb->ki_flags & IOCB_HIPRI) {
+ ret = -EINVAL;
+ goto out_fput;
+ }
+ kiocb->ki_complete = io_complete_rw;
+ }
+ return 0;
+out_fput:
+ if (!(flags & IOSQE_FIXED_FILE)) {
+ /*
+ * in case of error, we didn't use this file reference. drop it.
+ */
+ if (state)
+ state->used_refs--;
+ io_file_put(state, kiocb->ki_filp);
+ }
+ return ret;
+}
+
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+ switch (ret) {
+ case -EIOCBQUEUED:
+ break;
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /*
+ * We can't just restart the syscall, since previously
+ * submitted sqes may already be in progress. Just fail this
+ * IO with EINTR.
+ */
+ ret = -EINTR;
+ /* fall through */
+ default:
+ kiocb->ki_complete(kiocb, ret, 0);
+ }
+}
+
+static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
+ const struct io_uring_sqe *sqe,
+ struct iov_iter *iter)
+{
+ size_t len = READ_ONCE(sqe->len);
+ struct io_mapped_ubuf *imu;
+ unsigned index, buf_index;
+ size_t offset;
+ u64 buf_addr;
+
+ /* attempt to use fixed buffers without having provided iovecs */
+ if (unlikely(!ctx->user_bufs))
+ return -EFAULT;
+
+ buf_index = READ_ONCE(sqe->buf_index);
+ if (unlikely(buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+
+ index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+ imu = &ctx->user_bufs[index];
+ buf_addr = READ_ONCE(sqe->addr);
+
+ /* overflow */
+ if (buf_addr + len < buf_addr)
+ return -EFAULT;
+ /* not inside the mapped region */
+ if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+ return -EFAULT;
+
+ /*
+ * May not be a start of buffer, set size appropriately
+ * and advance us to the beginning.
+ */
+ offset = buf_addr - imu->ubuf;
+ iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+ if (offset)
+ iov_iter_advance(iter, offset);
+ return 0;
+}
+
+static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
+ const struct sqe_submit *s, struct iovec **iovec,
+ struct iov_iter *iter)
+{
+ const struct io_uring_sqe *sqe = s->sqe;
+ void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ size_t sqe_len = READ_ONCE(sqe->len);
+ u8 opcode;
+
+ /*
+ * We're reading ->opcode for the second time, but the first read
+ * doesn't care whether it's _FIXED or not, so it doesn't matter
+ * whether ->opcode changes concurrently. The first read does care
+ * about whether it is a READ or a WRITE, so we don't trust this read
+ * for that purpose and instead let the caller pass in the read/write
+ * flag.
+ */
+ opcode = READ_ONCE(sqe->opcode);
+ if (opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED) {
+ ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ *iovec = NULL;
+ return ret;
+ }
+
+ if (!s->has_user)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (ctx->compat)
+ return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
+ iovec, iter);
+#endif
+
+ return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+}
+
+/*
+ * Make a note of the last file/offset/direction we punted to async
+ * context. We'll use this information to see if we can piggy back a
+ * sequential request onto the previous one, if it's still hasn't been
+ * completed by the async worker.
+ */
+static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
+{
+ struct async_list *async_list = &req->ctx->pending_async[rw];
+ struct kiocb *kiocb = &req->rw;
+ struct file *filp = kiocb->ki_filp;
+ off_t io_end = kiocb->ki_pos + len;
+
+ if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
+ unsigned long max_pages;
+
+ /* Use 8x RA size as a decent limiter for both reads/writes */
+ max_pages = filp->f_ra.ra_pages;
+ if (!max_pages)
+ max_pages = VM_READAHEAD_PAGES;
+ max_pages *= 8;
+
+ /* If max pages are exceeded, reset the state */
+ len >>= PAGE_SHIFT;
+ if (async_list->io_pages + len <= max_pages) {
+ req->flags |= REQ_F_SEQ_PREV;
+ async_list->io_pages += len;
+ } else {
+ io_end = 0;
+ async_list->io_pages = 0;
+ }
+ }
+
+ /* New file? Reset state. */
+ if (async_list->file != filp) {
+ async_list->io_pages = 0;
+ async_list->file = filp;
+ }
+ async_list->io_end = io_end;
+}
+
+static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct kiocb *kiocb = &req->rw;
+ struct iov_iter iter;
+ struct file *file;
+ size_t iov_count;
+ ssize_t ret;
+
+ ret = io_prep_rw(req, s, force_nonblock, state);
+ if (ret)
+ return ret;
+ file = kiocb->ki_filp;
+
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_fput;
+ ret = -EINVAL;
+ if (unlikely(!file->f_op->read_iter))
+ goto out_fput;
+
+ ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+ if (ret)
+ goto out_fput;
+
+ iov_count = iov_iter_count(&iter);
+ ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
+ if (!ret) {
+ ssize_t ret2;
+
+ /* Catch -EAGAIN return for forced non-blocking submission */
+ ret2 = call_read_iter(file, kiocb, &iter);
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ io_rw_done(kiocb, ret2);
+ } else {
+ /*
+ * If ->needs_lock is true, we're already in async
+ * context.
+ */
+ if (!s->needs_lock)
+ io_async_list_note(READ, req, iov_count);
+ ret = -EAGAIN;
+ }
+ }
+ kfree(iovec);
+out_fput:
+ /* Hold on to the file for -EAGAIN */
+ if (unlikely(ret && ret != -EAGAIN))
+ io_fput(req);
+ return ret;
+}
+
+static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct kiocb *kiocb = &req->rw;
+ struct iov_iter iter;
+ struct file *file;
+ size_t iov_count;
+ ssize_t ret;
+
+ ret = io_prep_rw(req, s, force_nonblock, state);
+ if (ret)
+ return ret;
+
+ ret = -EBADF;
+ file = kiocb->ki_filp;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_fput;
+ ret = -EINVAL;
+ if (unlikely(!file->f_op->write_iter))
+ goto out_fput;
+
+ ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+ if (ret)
+ goto out_fput;
+
+ iov_count = iov_iter_count(&iter);
+
+ ret = -EAGAIN;
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
+ /* If ->needs_lock is true, we're already in async context. */
+ if (!s->needs_lock)
+ io_async_list_note(WRITE, req, iov_count);
+ goto out_free;
+ }
+
+ ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
+ if (!ret) {
+ /*
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in
+ * io_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
+ */
+ if (S_ISREG(file_inode(file)->i_mode)) {
+ __sb_start_write(file_inode(file)->i_sb,
+ SB_FREEZE_WRITE, true);
+ __sb_writers_release(file_inode(file)->i_sb,
+ SB_FREEZE_WRITE);
+ }
+ kiocb->ki_flags |= IOCB_WRITE;
+ io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+ }
+out_free:
+ kfree(iovec);
+out_fput:
+ /* Hold on to the file for -EAGAIN */
+ if (unlikely(ret && ret != -EAGAIN))
+ io_fput(req);
+ return ret;
+}
+
+/*
+ * IORING_OP_NOP just posts a completion event, nothing else.
+ */
+static int io_nop(struct io_kiocb *req, u64 user_data)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ long err = 0;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
+ /*
+ * Twilight zone - it's possible that someone issued an opcode that
+ * has a file attached, then got -EAGAIN on submission, and changed
+ * the sqe before we retried it from async context. Avoid dropping
+ * a file reference for this malicious case, and flag the error.
+ */
+ if (req->rw.ki_filp) {
+ err = -EBADF;
+ io_fput(req);
+ }
+ io_cqring_add_event(ctx, user_data, err, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned flags;
+ int fd;
+
+ /* Prep already done */
+ if (req->rw.ki_filp)
+ return 0;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ return -EINVAL;
+
+ fd = READ_ONCE(sqe->fd);
+ flags = READ_ONCE(sqe->flags);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+ return -EBADF;
+ req->rw.ki_filp = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ req->rw.ki_filp = fget(fd);
+ if (unlikely(!req->rw.ki_filp))
+ return -EBADF;
+ }
+
+ return 0;
+}
+
+static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
+{
+ loff_t sqe_off = READ_ONCE(sqe->off);
+ loff_t sqe_len = READ_ONCE(sqe->len);
+ loff_t end = sqe_off + sqe_len;
+ unsigned fsync_flags;
+ int ret;
+
+ fsync_flags = READ_ONCE(sqe->fsync_flags);
+ if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
+ return -EINVAL;
+
+ ret = io_prep_fsync(req, sqe);
+ if (ret)
+ return ret;
+
+ /* fsync always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
+ end > 0 ? end : LLONG_MAX,
+ fsync_flags & IORING_FSYNC_DATASYNC);
+
+ io_fput(req);
+ io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+ struct io_poll_iocb *poll = &req->poll;
+
+ spin_lock(&poll->head->lock);
+ WRITE_ONCE(poll->canceled, true);
+ if (!list_empty(&poll->wait.entry)) {
+ list_del_init(&poll->wait.entry);
+ queue_work(req->ctx->sqo_wq, &req->work);
+ }
+ spin_unlock(&poll->head->lock);
+
+ list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+ struct io_kiocb *req;
+
+ spin_lock_irq(&ctx->completion_lock);
+ while (!list_empty(&ctx->cancel_list)) {
+ req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+ io_poll_remove_one(req);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *poll_req, *next;
+ int ret = -ENOENT;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+ sqe->poll_events)
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+ if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+ io_poll_remove_one(poll_req);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+ io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+ io_fput(req);
+ io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_poll_iocb *poll = &req->poll;
+ struct poll_table_struct pt = { ._key = poll->events };
+ struct io_ring_ctx *ctx = req->ctx;
+ __poll_t mask = 0;
+
+ if (!READ_ONCE(poll->canceled))
+ mask = vfs_poll(poll->file, &pt) & poll->events;
+
+ /*
+ * Note that ->ki_cancel callers also delete iocb from active_reqs after
+ * calling ->ki_cancel. We need the ctx_lock roundtrip here to
+ * synchronize with them. In the cancellation case the list_del_init
+ * itself is not actually needed, but harmless so we keep it in to
+ * avoid further branches in the fast path.
+ */
+ spin_lock_irq(&ctx->completion_lock);
+ if (!mask && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
+ list_del_init(&req->list);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+ wait);
+ struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+ struct io_ring_ctx *ctx = req->ctx;
+ __poll_t mask = key_to_poll(key);
+
+ poll->woken = true;
+
+ /* for instances that support it check for an event match first: */
+ if (mask) {
+ unsigned long flags;
+
+ if (!(mask & poll->events))
+ return 0;
+
+ /* try to complete the iocb inline if we can: */
+ if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+ list_del(&req->list);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ list_del_init(&poll->wait.entry);
+ io_poll_complete(req, mask);
+ return 1;
+ }
+ }
+
+ list_del_init(&poll->wait.entry);
+ queue_work(ctx->sqo_wq, &req->work);
+ return 1;
+}
+
+struct io_poll_table {
+ struct poll_table_struct pt;
+ struct io_kiocb *req;
+ int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+ if (unlikely(pt->req->poll.head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ pt->req->poll.head = head;
+ add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_poll_iocb *poll = &req->poll;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_table ipt;
+ unsigned flags;
+ __poll_t mask;
+ u16 events;
+ int fd;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ return -EINVAL;
+
+ INIT_WORK(&req->work, io_poll_complete_work);
+ events = READ_ONCE(sqe->poll_events);
+ poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+ return -EBADF;
+ poll->file = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ poll->file = fget(fd);
+ }
+ if (unlikely(!poll->file))
+ return -EBADF;
+
+ poll->head = NULL;
+ poll->woken = false;
+ poll->canceled = false;
+
+ ipt.pt._qproc = io_poll_queue_proc;
+ ipt.pt._key = poll->events;
+ ipt.req = req;
+ ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+ /* initialized the list so that we can do list_empty checks */
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+ /* one for removal from waitqueue, one for this function */
+ refcount_set(&req->refs, 2);
+
+ mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+ if (unlikely(!poll->head)) {
+ /* we did not manage to set up a waitqueue, done */
+ goto out;
+ }
+
+ spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&poll->head->lock);
+ if (poll->woken) {
+ /* wake_up context handles the rest */
+ mask = 0;
+ ipt.error = 0;
+ } else if (mask || ipt.error) {
+ /* if we get an error or a mask we are done */
+ WARN_ON_ONCE(list_empty(&poll->wait.entry));
+ list_del_init(&poll->wait.entry);
+ } else {
+ /* actually waiting for an event */
+ list_add_tail(&req->list, &ctx->cancel_list);
+ }
+ spin_unlock(&poll->head->lock);
+ spin_unlock_irq(&ctx->completion_lock);
+
+out:
+ if (unlikely(ipt.error)) {
+ if (!(flags & IOSQE_FIXED_FILE))
+ fput(poll->file);
+ /*
+ * Drop one of our refs to this req, __io_submit_sqe() will
+ * drop the other one since we're returning an error.
+ */
+ io_free_req(req);
+ return ipt.error;
+ }
+
+ if (mask)
+ io_poll_complete(req, mask);
+ io_free_req(req);
+ return 0;
+}
+
+static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct sqe_submit *s, bool force_nonblock,
+ struct io_submit_state *state)
+{
+ ssize_t ret;
+ int opcode;
+
+ if (unlikely(s->index >= ctx->sq_entries))
+ return -EINVAL;
+ req->user_data = READ_ONCE(s->sqe->user_data);
+
+ opcode = READ_ONCE(s->sqe->opcode);
+ switch (opcode) {
+ case IORING_OP_NOP:
+ ret = io_nop(req, req->user_data);
+ break;
+ case IORING_OP_READV:
+ if (unlikely(s->sqe->buf_index))
+ return -EINVAL;
+ ret = io_read(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_WRITEV:
+ if (unlikely(s->sqe->buf_index))
+ return -EINVAL;
+ ret = io_write(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_READ_FIXED:
+ ret = io_read(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_WRITE_FIXED:
+ ret = io_write(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_FSYNC:
+ ret = io_fsync(req, s->sqe, force_nonblock);
+ break;
+ case IORING_OP_POLL_ADD:
+ ret = io_poll_add(req, s->sqe);
+ break;
+ case IORING_OP_POLL_REMOVE:
+ ret = io_poll_remove(req, s->sqe);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ if (req->error == -EAGAIN)
+ return -EAGAIN;
+
+ /* workqueue context doesn't hold uring_lock, grab it now */
+ if (s->needs_lock)
+ mutex_lock(&ctx->uring_lock);
+ io_iopoll_req_issued(req);
+ if (s->needs_lock)
+ mutex_unlock(&ctx->uring_lock);
+ }
+
+ return 0;
+}
+
+static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
+ const struct io_uring_sqe *sqe)
+{
+ switch (sqe->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ return &ctx->pending_async[READ];
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ return &ctx->pending_async[WRITE];
+ default:
+ return NULL;
+ }
+}
+
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+{
+ u8 opcode = READ_ONCE(sqe->opcode);
+
+ return !(opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED);
+}
+
+static void io_sq_wq_submit_work(struct work_struct *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct mm_struct *cur_mm = NULL;
+ struct async_list *async_list;
+ LIST_HEAD(req_list);
+ mm_segment_t old_fs;
+ int ret;
+
+ async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
+restart:
+ do {
+ struct sqe_submit *s = &req->submit;
+ const struct io_uring_sqe *sqe = s->sqe;
+
+ /* Ensure we clear previously set forced non-block flag */
+ req->flags &= ~REQ_F_FORCE_NONBLOCK;
+ req->rw.ki_flags &= ~IOCB_NOWAIT;
+
+ ret = 0;
+ if (io_sqe_needs_user(sqe) && !cur_mm) {
+ if (!mmget_not_zero(ctx->sqo_mm)) {
+ ret = -EFAULT;
+ } else {
+ cur_mm = ctx->sqo_mm;
+ use_mm(cur_mm);
+ old_fs = get_fs();
+ set_fs(USER_DS);
+ }
+ }
+
+ if (!ret) {
+ s->has_user = cur_mm != NULL;
+ s->needs_lock = true;
+ do {
+ ret = __io_submit_sqe(ctx, req, s, false, NULL);
+ /*
+ * We can get EAGAIN for polled IO even though
+ * we're forcing a sync submission from here,
+ * since we can't wait for request slots on the
+ * block side.
+ */
+ if (ret != -EAGAIN)
+ break;
+ cond_resched();
+ } while (1);
+ }
+ if (ret) {
+ io_cqring_add_event(ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ }
+
+ /* async context always use a copy of the sqe */
+ kfree(sqe);
+
+ if (!async_list)
+ break;
+ if (!list_empty(&req_list)) {
+ req = list_first_entry(&req_list, struct io_kiocb,
+ list);
+ list_del(&req->list);
+ continue;
+ }
+ if (list_empty(&async_list->list))
+ break;
+
+ req = NULL;
+ spin_lock(&async_list->lock);
+ if (list_empty(&async_list->list)) {
+ spin_unlock(&async_list->lock);
+ break;
+ }
+ list_splice_init(&async_list->list, &req_list);
+ spin_unlock(&async_list->lock);
+
+ req = list_first_entry(&req_list, struct io_kiocb, list);
+ list_del(&req->list);
+ } while (req);
+
+ /*
+ * Rare case of racing with a submitter. If we find the count has
+ * dropped to zero AND we have pending work items, then restart
+ * the processing. This is a tiny race window.
+ */
+ if (async_list) {
+ ret = atomic_dec_return(&async_list->cnt);
+ while (!ret && !list_empty(&async_list->list)) {
+ spin_lock(&async_list->lock);
+ atomic_inc(&async_list->cnt);
+ list_splice_init(&async_list->list, &req_list);
+ spin_unlock(&async_list->lock);
+
+ if (!list_empty(&req_list)) {
+ req = list_first_entry(&req_list,
+ struct io_kiocb, list);
+ list_del(&req->list);
+ goto restart;
+ }
+ ret = atomic_dec_return(&async_list->cnt);
+ }
+ }
+
+ if (cur_mm) {
+ set_fs(old_fs);
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ }
+}
+
+/*
+ * See if we can piggy back onto previously submitted work, that is still
+ * running. We currently only allow this if the new request is sequential
+ * to the previous one we punted.
+ */
+static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
+{
+ bool ret = false;
+
+ if (!list)
+ return false;
+ if (!(req->flags & REQ_F_SEQ_PREV))
+ return false;
+ if (!atomic_read(&list->cnt))
+ return false;
+
+ ret = true;
+ spin_lock(&list->lock);
+ list_add_tail(&req->list, &list->list);
+ if (!atomic_read(&list->cnt)) {
+ list_del_init(&req->list);
+ ret = false;
+ }
+ spin_unlock(&list->lock);
+ return ret;
+}
+
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+ struct io_submit_state *state)
+{
+ struct io_kiocb *req;
+ ssize_t ret;
+
+ /* enforce forwards compatibility on users */
+ if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
+ return -EINVAL;
+
+ req = io_get_req(ctx, state);
+ if (unlikely(!req))
+ return -EAGAIN;
+
+ req->rw.ki_filp = NULL;
+
+ ret = __io_submit_sqe(ctx, req, s, true, state);
+ if (ret == -EAGAIN) {
+ struct io_uring_sqe *sqe_copy;
+
+ sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+ if (sqe_copy) {
+ struct async_list *list;
+
+ memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
+ s->sqe = sqe_copy;
+
+ memcpy(&req->submit, s, sizeof(*s));
+ list = io_async_list_from_sqe(ctx, s->sqe);
+ if (!io_add_to_prev_work(list, req)) {
+ if (list)
+ atomic_inc(&list->cnt);
+ INIT_WORK(&req->work, io_sq_wq_submit_work);
+ queue_work(ctx->sqo_wq, &req->work);
+ }
+ ret = 0;
+ }
+ }
+ if (ret)
+ io_free_req(req);
+
+ return ret;
+}
+
+/*
+ * Batched submission is done, ensure local IO is flushed out.
+ */
+static void io_submit_state_end(struct io_submit_state *state)
+{
+ blk_finish_plug(&state->plug);
+ io_file_put(state, NULL);
+ if (state->free_reqs)
+ kmem_cache_free_bulk(req_cachep, state->free_reqs,
+ &state->reqs[state->cur_req]);
+}
+
+/*
+ * Start submission side cache.
+ */
+static void io_submit_state_start(struct io_submit_state *state,
+ struct io_ring_ctx *ctx, unsigned max_ios)
+{
+ blk_start_plug(&state->plug);
+ state->free_reqs = 0;
+ state->file = NULL;
+ state->ios_left = max_ios;
+}
+
+static void io_commit_sqring(struct io_ring_ctx *ctx)
+{
+ struct io_sq_ring *ring = ctx->sq_ring;
+
+ if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
+ /*
+ * Ensure any loads from the SQEs are done at this point,
+ * since once we write the new head, the application could
+ * write new data to them.
+ */
+ smp_store_release(&ring->r.head, ctx->cached_sq_head);
+
+ /*
+ * write side barrier of head update, app has read side. See
+ * comment at the top of this file
+ */
+ smp_wmb();
+ }
+}
+
+/*
+ * Undo last io_get_sqring()
+ */
+static void io_drop_sqring(struct io_ring_ctx *ctx)
+{
+ ctx->cached_sq_head--;
+}
+
+/*
+ * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * that is mapped by userspace. This means that care needs to be taken to
+ * ensure that reads are stable, as we cannot rely on userspace always
+ * being a good citizen. If members of the sqe are validated and then later
+ * used, it's important that those reads are done through READ_ONCE() to
+ * prevent a re-load down the line.
+ */
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+ struct io_sq_ring *ring = ctx->sq_ring;
+ unsigned head;
+
+ /*
+ * The cached sq head (or cq tail) serves two purposes:
+ *
+ * 1) allows us to batch the cost of updating the user visible
+ * head updates.
+ * 2) allows the kernel side to track the head on its own, even
+ * though the application is the one updating it.
+ */
+ head = ctx->cached_sq_head;
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (head == READ_ONCE(ring->r.tail))
+ return false;
+
+ head = READ_ONCE(ring->array[head & ctx->sq_mask]);
+ if (head < ctx->sq_entries) {
+ s->index = head;
+ s->sqe = &ctx->sq_sqes[head];
+ ctx->cached_sq_head++;
+ return true;
+ }
+
+ /* drop invalid entries */
+ ctx->cached_sq_head++;
+ ring->dropped++;
+ /* See comment at the top of this file */
+ smp_wmb();
+ return false;
+}
+
+static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
+ unsigned int nr, bool has_user, bool mm_fault)
+{
+ struct io_submit_state state, *statep = NULL;
+ int ret, i, submitted = 0;
+
+ if (nr > IO_PLUG_THRESHOLD) {
+ io_submit_state_start(&state, ctx, nr);
+ statep = &state;
+ }
+
+ for (i = 0; i < nr; i++) {
+ if (unlikely(mm_fault)) {
+ ret = -EFAULT;
+ } else {
+ sqes[i].has_user = has_user;
+ sqes[i].needs_lock = true;
+ sqes[i].needs_fixed_file = true;
+ ret = io_submit_sqe(ctx, &sqes[i], statep);
+ }
+ if (!ret) {
+ submitted++;
+ continue;
+ }
+
+ io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+ }
+
+ if (statep)
+ io_submit_state_end(&state);
+
+ return submitted;
+}
+
+static int io_sq_thread(void *data)
+{
+ struct sqe_submit sqes[IO_IOPOLL_BATCH];
+ struct io_ring_ctx *ctx = data;
+ struct mm_struct *cur_mm = NULL;
+ mm_segment_t old_fs;
+ DEFINE_WAIT(wait);
+ unsigned inflight;
+ unsigned long timeout;
+
+ old_fs = get_fs();
+ set_fs(USER_DS);
+
+ timeout = inflight = 0;
+ while (!kthread_should_stop() && !ctx->sqo_stop) {
+ bool all_fixed, mm_fault = false;
+ int i;
+
+ if (inflight) {
+ unsigned nr_events = 0;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ /*
+ * We disallow the app entering submit/complete
+ * with polling, but we still need to lock the
+ * ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
+ io_iopoll_check(ctx, &nr_events, 0);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ /*
+ * Normal IO, just pretend everything completed.
+ * We don't have to poll completions for that.
+ */
+ nr_events = inflight;
+ }
+
+ inflight -= nr_events;
+ if (!inflight)
+ timeout = jiffies + ctx->sq_thread_idle;
+ }
+
+ if (!io_get_sqring(ctx, &sqes[0])) {
+ /*
+ * We're polling. If we're within the defined idle
+ * period, then let us spin without work before going
+ * to sleep.
+ */
+ if (inflight || !time_after(jiffies, timeout)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * Drop cur_mm before scheduling, we can't hold it for
+ * long periods (or over schedule()). Do this before
+ * adding ourselves to the waitqueue, as the unuse/drop
+ * may sleep.
+ */
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ cur_mm = NULL;
+ }
+
+ prepare_to_wait(&ctx->sqo_wait, &wait,
+ TASK_INTERRUPTIBLE);
+
+ /* Tell userspace we may need a wakeup call */
+ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+
+ if (!io_get_sqring(ctx, &sqes[0])) {
+ if (kthread_should_stop()) {
+ finish_wait(&ctx->sqo_wait, &wait);
+ break;
+ }
+ if (signal_pending(current))
+ flush_signals(current);
+ schedule();
+ finish_wait(&ctx->sqo_wait, &wait);
+
+ ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+ continue;
+ }
+ finish_wait(&ctx->sqo_wait, &wait);
+
+ ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+ }
+
+ i = 0;
+ all_fixed = true;
+ do {
+ if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
+ all_fixed = false;
+
+ i++;
+ if (i == ARRAY_SIZE(sqes))
+ break;
+ } while (io_get_sqring(ctx, &sqes[i]));
+
+ /* Unless all new commands are FIXED regions, grab mm */
+ if (!all_fixed && !cur_mm) {
+ mm_fault = !mmget_not_zero(ctx->sqo_mm);
+ if (!mm_fault) {
+ use_mm(ctx->sqo_mm);
+ cur_mm = ctx->sqo_mm;
+ }
+ }
+
+ inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
+ mm_fault);
+
+ /* Commit SQ ring head once we've consumed all SQEs */
+ io_commit_sqring(ctx);
+ }
+
+ set_fs(old_fs);
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ }
+ return 0;
+}
+
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+{
+ struct io_submit_state state, *statep = NULL;
+ int i, ret = 0, submit = 0;
+
+ if (to_submit > IO_PLUG_THRESHOLD) {
+ io_submit_state_start(&state, ctx, to_submit);
+ statep = &state;
+ }
+
+ for (i = 0; i < to_submit; i++) {
+ struct sqe_submit s;
+
+ if (!io_get_sqring(ctx, &s))
+ break;
+
+ s.has_user = true;
+ s.needs_lock = false;
+ s.needs_fixed_file = false;
+
+ ret = io_submit_sqe(ctx, &s, statep);
+ if (ret) {
+ io_drop_sqring(ctx);
+ break;
+ }
+
+ submit++;
+ }
+ io_commit_sqring(ctx);
+
+ if (statep)
+ io_submit_state_end(statep);
+
+ return submit ? submit : ret;
+}
+
+static unsigned io_cqring_events(struct io_cq_ring *ring)
+{
+ return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+}
+
+/*
+ * Wait until events become available, if we don't already have some. The
+ * application must reap them itself, as they reside on the shared cq ring.
+ */
+static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
+ const sigset_t __user *sig, size_t sigsz)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+ sigset_t ksigmask, sigsaved;
+ DEFINE_WAIT(wait);
+ int ret;
+
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (io_cqring_events(ring) >= min_events)
+ return 0;
+
+ if (sig) {
+ ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
+
+ ret = 0;
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (io_cqring_events(ring) >= min_events)
+ break;
+
+ schedule();
+
+ ret = -EINTR;
+ if (signal_pending(current))
+ break;
+ } while (1);
+
+ finish_wait(&ctx->wait, &wait);
+
+ if (sig)
+ restore_user_sigmask(sig, &sigsaved);
+
+ return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
+}
+
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_UNIX)
+ if (ctx->ring_sock) {
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+ kfree_skb(skb);
+ }
+#else
+ int i;
+
+ for (i = 0; i < ctx->nr_user_files; i++)
+ fput(ctx->user_files[i]);
+#endif
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+ if (!ctx->user_files)
+ return -ENXIO;
+
+ __io_sqe_files_unregister(ctx);
+ kfree(ctx->user_files);
+ ctx->user_files = NULL;
+ ctx->nr_user_files = 0;
+ return 0;
+}
+
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+ if (ctx->sqo_thread) {
+ ctx->sqo_stop = 1;
+ mb();
+ kthread_stop(ctx->sqo_thread);
+ ctx->sqo_thread = NULL;
+ }
+}
+
+static void io_finish_async(struct io_ring_ctx *ctx)
+{
+ io_sq_thread_stop(ctx);
+
+ if (ctx->sqo_wq) {
+ destroy_workqueue(ctx->sqo_wq);
+ ctx->sqo_wq = NULL;
+ }
+}
+
+#if defined(CONFIG_UNIX)
+static void io_destruct_skb(struct sk_buff *skb)
+{
+ struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+
+ io_finish_async(ctx);
+ unix_destruct_scm(skb);
+}
+
+/*
+ * Ensure the UNIX gc is aware of our file set, so we are certain that
+ * the io_uring can be safely unregistered on process exit, even if we have
+ * loops in the file referencing.
+ */
+static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+{
+ struct sock *sk = ctx->ring_sock->sk;
+ struct scm_fp_list *fpl;
+ struct sk_buff *skb;
+ int i;
+
+ if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ unsigned long inflight = ctx->user->unix_inflight + nr;
+
+ if (inflight > task_rlimit(current, RLIMIT_NOFILE))
+ return -EMFILE;
+ }
+
+ fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+
+ skb = alloc_skb(0, GFP_KERNEL);
+ if (!skb) {
+ kfree(fpl);
+ return -ENOMEM;
+ }
+
+ skb->sk = sk;
+ skb->destructor = io_destruct_skb;
+
+ fpl->user = get_uid(ctx->user);
+ for (i = 0; i < nr; i++) {
+ fpl->fp[i] = get_file(ctx->user_files[i + offset]);
+ unix_inflight(fpl->user, fpl->fp[i]);
+ }
+
+ fpl->max = fpl->count = nr;
+ UNIXCB(skb).fp = fpl;
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ for (i = 0; i < nr; i++)
+ fput(fpl->fp[i]);
+
+ return 0;
+}
+
+/*
+ * If UNIX sockets are enabled, fd passing can cause a reference cycle which
+ * causes regular reference counting to break down. We rely on the UNIX
+ * garbage collection to take care of this problem for us.
+ */
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+ unsigned left, total;
+ int ret = 0;
+
+ total = 0;
+ left = ctx->nr_user_files;
+ while (left) {
+ unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
+ int ret;
+
+ ret = __io_sqe_files_scm(ctx, this_files, total);
+ if (ret)
+ break;
+ left -= this_files;
+ total += this_files;
+ }
+
+ if (!ret)
+ return 0;
+
+ while (total < ctx->nr_user_files) {
+ fput(ctx->user_files[total]);
+ total++;
+ }
+
+ return ret;
+}
+#else
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+ return 0;
+}
+#endif
+
+static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ __s32 __user *fds = (__s32 __user *) arg;
+ int fd, ret = 0;
+ unsigned i;
+
+ if (ctx->user_files)
+ return -EBUSY;
+ if (!nr_args)
+ return -EINVAL;
+ if (nr_args > IORING_MAX_FIXED_FILES)
+ return -EMFILE;
+
+ ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
+ if (!ctx->user_files)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_args; i++) {
+ ret = -EFAULT;
+ if (copy_from_user(&fd, &fds[i], sizeof(fd)))
+ break;
+
+ ctx->user_files[i] = fget(fd);
+
+ ret = -EBADF;
+ if (!ctx->user_files[i])
+ break;
+ /*
+ * Don't allow io_uring instances to be registered. If UNIX
+ * isn't enabled, then this causes a reference cycle and this
+ * instance can never get freed. If UNIX is enabled we'll
+ * handle it just fine, but there's still no point in allowing
+ * a ring fd as it doesn't support regular read/write anyway.
+ */
+ if (ctx->user_files[i]->f_op == &io_uring_fops) {
+ fput(ctx->user_files[i]);
+ break;
+ }
+ ctx->nr_user_files++;
+ ret = 0;
+ }
+
+ if (ret) {
+ for (i = 0; i < ctx->nr_user_files; i++)
+ fput(ctx->user_files[i]);
+
+ kfree(ctx->user_files);
+ ctx->nr_user_files = 0;
+ return ret;
+ }
+
+ ret = io_sqe_files_scm(ctx);
+ if (ret)
+ io_sqe_files_unregister(ctx);
+
+ return ret;
+}
+
+static int io_sq_offload_start(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ int ret;
+
+ init_waitqueue_head(&ctx->sqo_wait);
+ mmgrab(current->mm);
+ ctx->sqo_mm = current->mm;
+
+ ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+ if (!ctx->sq_thread_idle)
+ ctx->sq_thread_idle = HZ;
+
+ ret = -EINVAL;
+ if (!cpu_possible(p->sq_thread_cpu))
+ goto err;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (p->flags & IORING_SETUP_SQ_AFF) {
+ int cpu;
+
+ cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+ ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
+ ctx, cpu,
+ "io_uring-sq");
+ } else {
+ ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+ "io_uring-sq");
+ }
+ if (IS_ERR(ctx->sqo_thread)) {
+ ret = PTR_ERR(ctx->sqo_thread);
+ ctx->sqo_thread = NULL;
+ goto err;
+ }
+ wake_up_process(ctx->sqo_thread);
+ } else if (p->flags & IORING_SETUP_SQ_AFF) {
+ /* Can't have SQ_AFF without SQPOLL */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Do QD, or 2 * CPUS, whatever is smallest */
+ ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
+ min(ctx->sq_entries - 1, 2 * num_online_cpus()));
+ if (!ctx->sqo_wq) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ return 0;
+err:
+ io_sq_thread_stop(ctx);
+ mmdrop(ctx->sqo_mm);
+ ctx->sqo_mm = NULL;
+ return ret;
+}
+
+static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+{
+ atomic_long_sub(nr_pages, &user->locked_vm);
+}
+
+static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+{
+ unsigned long page_limit, cur_pages, new_pages;
+
+ /* Don't allow more pages than we can safely lock */
+ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ do {
+ cur_pages = atomic_long_read(&user->locked_vm);
+ new_pages = cur_pages + nr_pages;
+ if (new_pages > page_limit)
+ return -ENOMEM;
+ } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+ new_pages) != cur_pages);
+
+ return 0;
+}
+
+static void io_mem_free(void *ptr)
+{
+ struct page *page = virt_to_head_page(ptr);
+
+ if (put_page_testzero(page))
+ free_compound_page(page);
+}
+
+static void *io_mem_alloc(size_t size)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
+ __GFP_NORETRY;
+
+ return (void *) __get_free_pages(gfp_flags, get_order(size));
+}
+
+static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
+{
+ struct io_sq_ring *sq_ring;
+ struct io_cq_ring *cq_ring;
+ size_t bytes;
+
+ bytes = struct_size(sq_ring, array, sq_entries);
+ bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
+ bytes += struct_size(cq_ring, cqes, cq_entries);
+
+ return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+}
+
+static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+{
+ int i, j;
+
+ if (!ctx->user_bufs)
+ return -ENXIO;
+
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+ for (j = 0; j < imu->nr_bvecs; j++)
+ put_page(imu->bvec[j].bv_page);
+
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, imu->nr_bvecs);
+ kfree(imu->bvec);
+ imu->nr_bvecs = 0;
+ }
+
+ kfree(ctx->user_bufs);
+ ctx->user_bufs = NULL;
+ ctx->nr_user_bufs = 0;
+ return 0;
+}
+
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+ void __user *arg, unsigned index)
+{
+ struct iovec __user *src;
+
+#ifdef CONFIG_COMPAT
+ if (ctx->compat) {
+ struct compat_iovec __user *ciovs;
+ struct compat_iovec ciov;
+
+ ciovs = (struct compat_iovec __user *) arg;
+ if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
+ return -EFAULT;
+
+ dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+ dst->iov_len = ciov.iov_len;
+ return 0;
+ }
+#endif
+ src = (struct iovec __user *) arg;
+ if (copy_from_user(dst, &src[index], sizeof(*dst)))
+ return -EFAULT;
+ return 0;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct vm_area_struct **vmas = NULL;
+ struct page **pages = NULL;
+ int i, j, got_pages = 0;
+ int ret = -EINVAL;
+
+ if (ctx->user_bufs)
+ return -EBUSY;
+ if (!nr_args || nr_args > UIO_MAXIOV)
+ return -EINVAL;
+
+ ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
+ GFP_KERNEL);
+ if (!ctx->user_bufs)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_args; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ unsigned long off, start, end, ubuf;
+ int pret, nr_pages;
+ struct iovec iov;
+ size_t size;
+
+ ret = io_copy_iov(ctx, &iov, arg, i);
+ if (ret)
+ break;
+
+ /*
+ * Don't impose further limits on the size and buffer
+ * constraints here, we'll -EINVAL later when IO is
+ * submitted if they are wrong.
+ */
+ ret = -EFAULT;
+ if (!iov.iov_base || !iov.iov_len)
+ goto err;
+
+ /* arbitrary limit, but we need something */
+ if (iov.iov_len > SZ_1G)
+ goto err;
+
+ ubuf = (unsigned long) iov.iov_base;
+ end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ubuf >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ if (ctx->account_mem) {
+ ret = io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ goto err;
+ }
+
+ ret = 0;
+ if (!pages || nr_pages > got_pages) {
+ kfree(vmas);
+ kfree(pages);
+ pages = kmalloc_array(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ vmas = kmalloc_array(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!pages || !vmas) {
+ ret = -ENOMEM;
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+ got_pages = nr_pages;
+ }
+
+ imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!imu->bvec) {
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+
+ ret = 0;
+ down_read(&current->mm->mmap_sem);
+ pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
+ pages, vmas);
+ if (pret == nr_pages) {
+ /* don't support file backed memory */
+ for (j = 0; j < nr_pages; j++) {
+ struct vm_area_struct *vma = vmas[j];
+
+ if (vma->vm_file &&
+ !is_file_hugepages(vma->vm_file)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+ } else {
+ ret = pret < 0 ? pret : -EFAULT;
+ }
+ up_read(&current->mm->mmap_sem);
+ if (ret) {
+ /*
+ * if we did partial map, or found file backed vmas,
+ * release any pages we did get
+ */
+ if (pret > 0) {
+ for (j = 0; j < pret; j++)
+ put_page(pages[j]);
+ }
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+
+ off = ubuf & ~PAGE_MASK;
+ size = iov.iov_len;
+ for (j = 0; j < nr_pages; j++) {
+ size_t vec_len;
+
+ vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ imu->bvec[j].bv_page = pages[j];
+ imu->bvec[j].bv_len = vec_len;
+ imu->bvec[j].bv_offset = off;
+ off = 0;
+ size -= vec_len;
+ }
+ /* store original address for later verification */
+ imu->ubuf = ubuf;
+ imu->len = iov.iov_len;
+ imu->nr_bvecs = nr_pages;
+
+ ctx->nr_user_bufs++;
+ }
+ kfree(pages);
+ kfree(vmas);
+ return 0;
+err:
+ kfree(pages);
+ kfree(vmas);
+ io_sqe_buffer_unregister(ctx);
+ return ret;
+}
+
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
+ io_finish_async(ctx);
+ if (ctx->sqo_mm)
+ mmdrop(ctx->sqo_mm);
+
+ io_iopoll_reap_events(ctx);
+ io_sqe_buffer_unregister(ctx);
+ io_sqe_files_unregister(ctx);
+
+#if defined(CONFIG_UNIX)
+ if (ctx->ring_sock)
+ sock_release(ctx->ring_sock);
+#endif
+
+ io_mem_free(ctx->sq_ring);
+ io_mem_free(ctx->sq_sqes);
+ io_mem_free(ctx->cq_ring);
+
+ percpu_ref_exit(&ctx->refs);
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user,
+ ring_pages(ctx->sq_entries, ctx->cq_entries));
+ free_uid(ctx->user);
+ kfree(ctx);
+}
+
+static __poll_t io_uring_poll(struct file *file, poll_table *wait)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(file, &ctx->cq_wait, wait);
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static int io_uring_fasync(int fd, struct file *file, int on)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ return fasync_helper(fd, file, on, &ctx->cq_fasync);
+}
+
+static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+{
+ mutex_lock(&ctx->uring_lock);
+ percpu_ref_kill(&ctx->refs);
+ mutex_unlock(&ctx->uring_lock);
+
+ io_poll_remove_all(ctx);
+ io_iopoll_reap_events(ctx);
+ wait_for_completion(&ctx->ctx_done);
+ io_ring_ctx_free(ctx);
+}
+
+static int io_uring_release(struct inode *inode, struct file *file)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ file->private_data = NULL;
+ io_ring_ctx_wait_and_kill(ctx);
+ return 0;
+}
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long sz = vma->vm_end - vma->vm_start;
+ struct io_ring_ctx *ctx = file->private_data;
+ unsigned long pfn;
+ struct page *page;
+ void *ptr;
+
+ switch (offset) {
+ case IORING_OFF_SQ_RING:
+ ptr = ctx->sq_ring;
+ break;
+ case IORING_OFF_SQES:
+ ptr = ctx->sq_sqes;
+ break;
+ case IORING_OFF_CQ_RING:
+ ptr = ctx->cq_ring;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ page = virt_to_head_page(ptr);
+ if (sz > (PAGE_SIZE << compound_order(page)))
+ return -EINVAL;
+
+ pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+ return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+
+SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
+ u32, min_complete, u32, flags, const sigset_t __user *, sig,
+ size_t, sigsz)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ int submitted = 0;
+ struct fd f;
+
+ if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+ return -EINVAL;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EOPNOTSUPP;
+ if (f.file->f_op != &io_uring_fops)
+ goto out_fput;
+
+ ret = -ENXIO;
+ ctx = f.file->private_data;
+ if (!percpu_ref_tryget(&ctx->refs))
+ goto out_fput;
+
+ /*
+ * For SQ polling, the thread will do all submissions and completions.
+ * Just return the requested submit count, and wake the thread if
+ * we were asked to.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (flags & IORING_ENTER_SQ_WAKEUP)
+ wake_up(&ctx->sqo_wait);
+ submitted = to_submit;
+ goto out_ctx;
+ }
+
+ ret = 0;
+ if (to_submit) {
+ to_submit = min(to_submit, ctx->sq_entries);
+
+ mutex_lock(&ctx->uring_lock);
+ submitted = io_ring_submit(ctx, to_submit);
+ mutex_unlock(&ctx->uring_lock);
+
+ if (submitted < 0)
+ goto out_ctx;
+ }
+ if (flags & IORING_ENTER_GETEVENTS) {
+ unsigned nr_events = 0;
+
+ min_complete = min(min_complete, ctx->cq_entries);
+
+ /*
+ * The application could have included the 'to_submit' count
+ * in how many events it wanted to wait for. If we failed to
+ * submit the desired count, we may need to adjust the number
+ * of events to poll/wait for.
+ */
+ if (submitted < to_submit)
+ min_complete = min_t(unsigned, submitted, min_complete);
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ mutex_lock(&ctx->uring_lock);
+ ret = io_iopoll_check(ctx, &nr_events, min_complete);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+ }
+ }
+
+out_ctx:
+ io_ring_drop_ctx_refs(ctx, 1);
+out_fput:
+ fdput(f);
+ return submitted ? submitted : ret;
+}
+
+static const struct file_operations io_uring_fops = {
+ .release = io_uring_release,
+ .mmap = io_uring_mmap,
+ .poll = io_uring_poll,
+ .fasync = io_uring_fasync,
+};
+
+static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ struct io_sq_ring *sq_ring;
+ struct io_cq_ring *cq_ring;
+ size_t size;
+
+ sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
+ if (!sq_ring)
+ return -ENOMEM;
+
+ ctx->sq_ring = sq_ring;
+ sq_ring->ring_mask = p->sq_entries - 1;
+ sq_ring->ring_entries = p->sq_entries;
+ ctx->sq_mask = sq_ring->ring_mask;
+ ctx->sq_entries = sq_ring->ring_entries;
+
+ size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ ctx->sq_sqes = io_mem_alloc(size);
+ if (!ctx->sq_sqes) {
+ io_mem_free(ctx->sq_ring);
+ return -ENOMEM;
+ }
+
+ cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
+ if (!cq_ring) {
+ io_mem_free(ctx->sq_ring);
+ io_mem_free(ctx->sq_sqes);
+ return -ENOMEM;
+ }
+
+ ctx->cq_ring = cq_ring;
+ cq_ring->ring_mask = p->cq_entries - 1;
+ cq_ring->ring_entries = p->cq_entries;
+ ctx->cq_mask = cq_ring->ring_mask;
+ ctx->cq_entries = cq_ring->ring_entries;
+ return 0;
+}
+
+/*
+ * Allocate an anonymous fd, this is what constitutes the application
+ * visible backing of an io_uring instance. The application mmaps this
+ * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
+ * we have to tie this fd to a socket for file garbage collection purposes.
+ */
+static int io_uring_get_fd(struct io_ring_ctx *ctx)
+{
+ struct file *file;
+ int ret;
+
+#if defined(CONFIG_UNIX)
+ ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
+ &ctx->ring_sock);
+ if (ret)
+ return ret;
+#endif
+
+ ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (ret < 0)
+ goto err;
+
+ file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(file)) {
+ put_unused_fd(ret);
+ ret = PTR_ERR(file);
+ goto err;
+ }
+
+#if defined(CONFIG_UNIX)
+ ctx->ring_sock->file = file;
+ ctx->ring_sock->sk->sk_user_data = ctx;
+#endif
+ fd_install(ret, file);
+ return ret;
+err:
+#if defined(CONFIG_UNIX)
+ sock_release(ctx->ring_sock);
+ ctx->ring_sock = NULL;
+#endif
+ return ret;
+}
+
+static int io_uring_create(unsigned entries, struct io_uring_params *p)
+{
+ struct user_struct *user = NULL;
+ struct io_ring_ctx *ctx;
+ bool account_mem;
+ int ret;
+
+ if (!entries || entries > IORING_MAX_ENTRIES)
+ return -EINVAL;
+
+ /*
+ * Use twice as many entries for the CQ ring. It's possible for the
+ * application to drive a higher depth than the size of the SQ ring,
+ * since the sqes are only used at submission time. This allows for
+ * some flexibility in overcommitting a bit.
+ */
+ p->sq_entries = roundup_pow_of_two(entries);
+ p->cq_entries = 2 * p->sq_entries;
+
+ user = get_uid(current_user());
+ account_mem = !capable(CAP_IPC_LOCK);
+
+ if (account_mem) {
+ ret = io_account_mem(user,
+ ring_pages(p->sq_entries, p->cq_entries));
+ if (ret) {
+ free_uid(user);
+ return ret;
+ }
+ }
+
+ ctx = io_ring_ctx_alloc(p);
+ if (!ctx) {
+ if (account_mem)
+ io_unaccount_mem(user, ring_pages(p->sq_entries,
+ p->cq_entries));
+ free_uid(user);
+ return -ENOMEM;
+ }
+ ctx->compat = in_compat_syscall();
+ ctx->account_mem = account_mem;
+ ctx->user = user;
+
+ ret = io_allocate_scq_urings(ctx, p);
+ if (ret)
+ goto err;
+
+ ret = io_sq_offload_start(ctx, p);
+ if (ret)
+ goto err;
+
+ ret = io_uring_get_fd(ctx);
+ if (ret < 0)
+ goto err;
+
+ memset(&p->sq_off, 0, sizeof(p->sq_off));
+ p->sq_off.head = offsetof(struct io_sq_ring, r.head);
+ p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
+ p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
+ p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
+ p->sq_off.flags = offsetof(struct io_sq_ring, flags);
+ p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
+ p->sq_off.array = offsetof(struct io_sq_ring, array);
+
+ memset(&p->cq_off, 0, sizeof(p->cq_off));
+ p->cq_off.head = offsetof(struct io_cq_ring, r.head);
+ p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
+ p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
+ p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
+ p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
+ p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
+ return ret;
+err:
+ io_ring_ctx_wait_and_kill(ctx);
+ return ret;
+}
+
+/*
+ * Sets up an aio uring context, and returns the fd. Applications asks for a
+ * ring size, we return the actual sq/cq ring sizes (among other things) in the
+ * params structure passed in.
+ */
+static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
+{
+ struct io_uring_params p;
+ long ret;
+ int i;
+
+ if (copy_from_user(&p, params, sizeof(p)))
+ return -EFAULT;
+ for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
+ if (p.resv[i])
+ return -EINVAL;
+ }
+
+ if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
+ IORING_SETUP_SQ_AFF))
+ return -EINVAL;
+
+ ret = io_uring_create(entries, &p);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(params, &p, sizeof(p)))
+ return -EFAULT;
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(io_uring_setup, u32, entries,
+ struct io_uring_params __user *, params)
+{
+ return io_uring_setup(entries, params);
+}
+
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+ void __user *arg, unsigned nr_args)
+{
+ int ret;
+
+ percpu_ref_kill(&ctx->refs);
+ wait_for_completion(&ctx->ctx_done);
+
+ switch (opcode) {
+ case IORING_REGISTER_BUFFERS:
+ ret = io_sqe_buffer_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_BUFFERS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_buffer_unregister(ctx);
+ break;
+ case IORING_REGISTER_FILES:
+ ret = io_sqe_files_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_FILES:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_files_unregister(ctx);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ /* bring the ctx back to life */
+ reinit_completion(&ctx->ctx_done);
+ percpu_ref_reinit(&ctx->refs);
+ return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+ void __user *, arg, unsigned int, nr_args)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ struct fd f;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EOPNOTSUPP;
+ if (f.file->f_op != &io_uring_fops)
+ goto out_fput;
+
+ ctx = f.file->private_data;
+
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_uring_register(ctx, opcode, arg, nr_args);
+ mutex_unlock(&ctx->uring_lock);
+out_fput:
+ fdput(f);
+ return ret;
+}
+
+static int __init io_uring_init(void)
+{
+ req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ return 0;
+};
+__initcall(io_uring_init);
diff --git a/fs/iomap.c b/fs/iomap.c
index 897c60215dd1..97cb9d486a7d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -274,8 +274,9 @@ iomap_read_end_io(struct bio *bio)
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
iomap_read_page_end_io(bvec, error);
bio_put(bio);
}
@@ -324,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
*/
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
- if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -355,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
- __bio_add_page(ctx->bio, page, plen, poff);
+ bio_add_page(ctx->bio, page, plen, poff);
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@@ -1463,6 +1464,28 @@ struct iomap_dio {
};
};
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+ struct request_queue *q = READ_ONCE(kiocb->private);
+
+ if (!q)
+ return 0;
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+ struct bio *bio)
+{
+ atomic_inc(&dio->ref);
+
+ if (dio->iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(bio, dio->iocb);
+
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+}
+
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
struct kiocb *iocb = dio->iocb;
@@ -1568,14 +1591,15 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
}
-static blk_qc_t
+static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
unsigned len)
{
@@ -1589,15 +1613,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- if (dio->iocb->ki_flags & IOCB_HIPRI)
- flags |= REQ_HIPRI;
-
get_page(page);
__bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
-
- atomic_inc(&dio->ref);
- return submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
}
static loff_t
@@ -1700,9 +1719,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
bio_set_pages_dirty(bio);
}
- if (dio->iocb->ki_flags & IOCB_HIPRI)
- bio->bi_opf |= REQ_HIPRI;
-
iov_iter_advance(dio->submit.iter, n);
dio->size += n;
@@ -1710,11 +1726,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
copied += n;
nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
- atomic_inc(&dio->ref);
-
- dio->submit.last_queue = bdev_get_queue(iomap->bdev);
- dio->submit.cookie = submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
} while (nr_pages);
/*
@@ -1925,6 +1937,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+ WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+ WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
/*
* We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three three
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 26f8d7e46462..02e0b79753e7 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
nblocks = jbd2_space_needed(journal);
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
- mutex_lock(&journal->j_checkpoint_mutex);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
@@ -276,9 +276,22 @@ restart:
"JBD2: %s: Waiting for Godot: block %llu\n",
journal->j_devname, (unsigned long long) bh->b_blocknr);
+ if (batch_count)
+ __flush_batch(journal, &batch_count);
jbd2_log_start_commit(journal, tid);
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set, jbd2_update_log_tail() called by
+ * jbd2_journal_commit_transaction() may also take
+ * checkpoint_mutex. So we need to temporarily
+ * drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
- goto retry;
+ mutex_lock_io(&journal->j_checkpoint_mutex);
+ spin_lock(&journal->j_list_lock);
+ goto restart;
}
if (!buffer_dirty(bh)) {
if (unlikely(buffer_write_io_error(bh)) && !result)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2eb55c3361a8..efd0ce9489ae 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
the last tag we set up. */
tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
- jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
+ if (descriptor)
+ jbd2_descriptor_block_csum_set(journal,
+ descriptor);
+
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8ef6b6daaa7a..382c030cc78b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
return cpu_to_be32(csum);
}
-static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return 1;
-
- return sb->s_checksum == jbd2_superblock_csum(j, sb);
-}
-
-static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- sb->s_checksum = jbd2_superblock_csum(j, sb);
-}
-
/*
* Helper function used to manage commit timeouts
*/
@@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal)
return jbd2_journal_start_thread(journal);
}
+/*
+ * This function expects that the caller will have locked the journal
+ * buffer head, and will return with it unlocked
+ */
static int jbd2_write_superblock(journal_t *journal, int write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
@@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
- lock_buffer(bh);
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
- jbd2_superblock_csum_set(journal, sb);
+ if (jbd2_journal_has_csum_v2or3(journal))
+ sb->s_checksum = jbd2_superblock_csum(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
@@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
tail_block, tail_tid);
+ lock_buffer(journal->j_sb_buffer);
sb->s_sequence = cpu_to_be32(tail_tid);
sb->s_start = cpu_to_be32(tail_block);
@@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
journal_superblock_t *sb = journal->j_superblock;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- read_lock(&journal->j_state_lock);
- /* Is it already empty? */
- if (sb->s_start == 0) {
- read_unlock(&journal->j_state_lock);
+ lock_buffer(journal->j_sb_buffer);
+ if (sb->s_start == 0) { /* Is it already empty? */
+ unlock_buffer(journal->j_sb_buffer);
return;
}
+
jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
sb->s_start = cpu_to_be32(0);
- read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, write_op);
@@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
journal_superblock_t *sb = journal->j_superblock;
int errcode;
- read_lock(&journal->j_state_lock);
+ lock_buffer(journal->j_sb_buffer);
errcode = journal->j_errno;
- read_unlock(&journal->j_state_lock);
if (errcode == -ESHUTDOWN)
errcode = 0;
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
@@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal)
}
}
- /* Check superblock checksum */
- if (!jbd2_superblock_csum_verify(journal, sb)) {
- printk(KERN_ERR "JBD2: journal checksum error\n");
- err = -EFSBADCRC;
- goto out;
- }
+ if (jbd2_journal_has_csum_v2or3(journal)) {
+ /* Check superblock checksum */
+ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+ printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
+ goto out;
+ }
- /* Precompute checksum seed for all metadata */
- if (jbd2_journal_has_csum_v2or3(journal))
+ /* Precompute checksum seed for all metadata */
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
+ }
set_buffer_verified(bh);
@@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
+ /* Load the checksum driver if necessary */
+ if ((journal->j_chksum_driver == NULL) &&
+ INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+ journal->j_chksum_driver = NULL;
+ return 0;
+ }
+ /* Precompute checksum seed for all metadata */
+ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+ }
+
+ lock_buffer(journal->j_sb_buffer);
+
/* If enabling v3 checksums, update superblock */
if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
-
- /* Load the checksum driver */
- if (journal->j_chksum_driver == NULL) {
- journal->j_chksum_driver = crypto_alloc_shash("crc32c",
- 0, 0);
- if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD2: Cannot load crc32c "
- "driver.\n");
- journal->j_chksum_driver = NULL;
- return 0;
- }
-
- /* Precompute checksum seed for all metadata */
- journal->j_csum_seed = jbd2_chksum(journal, ~0,
- sb->s_uuid,
- sizeof(sb->s_uuid));
- }
}
/* If enabling v1 checksums, downgrade superblock */
@@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
+ unlock_buffer(journal->j_sb_buffer);
return 1;
#undef COMPAT_FEATURE_ON
@@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
err = jbd2_journal_skip_recovery(journal);
if (write) {
/* Lock to make assertions happy... */
- mutex_lock(&journal->j_checkpoint_mutex);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cc35537232f2..f940d31c2adc 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
- * Simply allocate and initialise a new transaction. Create it in
+ * Simply initialise a new transaction. Initialize it in
* RUNNING state and add it to the current journal (which should not
* have an existing running transaction: we only make a new transaction
* once we have started to commit the old one).
@@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
*
*/
-static transaction_t *
-jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+static void jbd2_get_transaction(journal_t *journal,
+ transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
@@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_max_wait = 0;
transaction->t_start = jiffies;
transaction->t_requested = 0;
-
- return transaction;
}
/*
@@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
char *committed_data = NULL;
- JBUFFER_TRACE(jh, "entry");
if (jbd2_write_access_granted(handle, bh, true))
return 0;
jh = jbd2_journal_add_journal_head(bh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* Do this first --- it can drop the journal lock, so we want to
* make sure that obtaining the committed_data is done
@@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
if (is_handle_aborted(handle))
return -EROFS;
- if (!buffer_jbd(bh)) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (!buffer_jbd(bh))
+ return -EUCLEAN;
+
/*
* We don't grab jh reference here since the buffer must be part
* of the running transaction.
*/
jh = bh2jh(bh);
+ jbd_debug(5, "journal_head %p\n", jh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* This and the following assertions are unreliable since we may see jh
* in inconsistent state unless we grab bh_state lock. But this is
@@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
}
journal = transaction->t_journal;
- jbd_debug(5, "journal_head %p\n", jh);
- JBUFFER_TRACE(jh, "entry");
-
jbd_lock_bh_state(bh);
if (jh->b_modified == 0) {
@@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_unfile_buffer(jh);
if (!buffer_jbd(bh)) {
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
+ goto not_jbd;
}
}
spin_unlock(&journal->j_list_lock);
@@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
/* However, if the buffer is still owned by a prior
* (committing) transaction, we can't drop it yet... */
JBUFFER_TRACE(jh, "belongs to older transaction");
- /* ... but we CAN drop it from the new transaction if we
- * have also modified it since the original commit. */
+ /* ... but we CAN drop it from the new transaction through
+ * marking the buffer as freed and set j_next_transaction to
+ * the new transaction, so that not only the commit code
+ * knows it should clear dirty bits when it is done with the
+ * buffer, but also the buffer can be checkpointed only
+ * after the new transaction commits. */
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction == transaction);
+ set_buffer_freed(bh);
+
+ if (!jh->b_next_transaction) {
spin_lock(&journal->j_list_lock);
- jh->b_next_transaction = NULL;
+ jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
+ } else {
+ J_ASSERT(jh->b_next_transaction == transaction);
/*
* only drop a reference if this transaction modified
@@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (was_modified)
drop_reserve = 1;
}
+ } else {
+ /*
+ * Finally, if the buffer is not belongs to any
+ * transaction, we can just drop it now if it has no
+ * checkpoint.
+ */
+ spin_lock(&journal->j_list_lock);
+ if (!jh->b_cp_transaction) {
+ JBUFFER_TRACE(jh, "belongs to none transaction");
+ spin_unlock(&journal->j_list_lock);
+ goto not_jbd;
+ }
+
+ /*
+ * Otherwise, if the buffer has been written to disk,
+ * it is safe to remove the checkpoint and drop it.
+ */
+ if (!buffer_dirty(bh)) {
+ __jbd2_journal_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ goto not_jbd;
+ }
+
+ /*
+ * The buffer is still not written to disk, we should
+ * attach this buffer to current transaction so that the
+ * buffer can be checkpointed only after the current
+ * transaction commits.
+ */
+ clear_buffer_dirty(bh);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+ spin_unlock(&journal->j_list_lock);
}
-not_jbd:
jbd_unlock_bh_state(bh);
__brelse(bh);
drop:
@@ -1636,6 +1670,11 @@ drop:
handle->h_buffer_credits++;
}
return err;
+
+not_jbd:
+ jbd_unlock_bh_state(bh);
+ __bforget(bh);
+ goto drop;
}
/**
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 4ca0b5c18192..b84d635567d3 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -536,8 +536,8 @@ void kernfs_put(struct kernfs_node *kn)
security_release_secctx(kn->iattr->ia_secdata,
kn->iattr->ia_secdata_len);
simple_xattrs_free(&kn->iattr->xattrs);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
- kfree(kn->iattr);
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, kn->id.ino);
spin_unlock(&kernfs_idr_lock);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f8d5021a652e..ae948aaa4c53 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
* to see if it supports poll (Neither 'poll' nor 'select' return
* an appropriate error code). When in doubt, set a suitable timeout value.
*/
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
+{
+ struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
+ struct kernfs_open_node *on = kn->attr.open;
+
+ poll_wait(of->file, &on->poll, wait);
+
+ if (of->event != atomic_read(&on->event))
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ __poll_t ret;
if (!kernfs_get_active(kn))
- goto trigger;
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
- poll_wait(filp, &on->poll, wait);
+ if (kn->attr.ops->poll)
+ ret = kn->attr.ops->poll(of, wait);
+ else
+ ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn);
-
- if (of->event != atomic_read(&on->event))
- goto trigger;
-
- return DEFAULT_POLLMASK;
-
- trigger:
- return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+ return ret;
}
static void kernfs_notify_workfn(struct work_struct *work)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 80cebcd94c90..0c1fd945ce42 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -42,7 +42,7 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
if (kn->iattr)
goto out_unlock;
- kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+ kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
if (!kn->iattr)
goto out_unlock;
iattrs = &kn->iattr->ia_iattr;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3d83b114bb08..0b7d197a904c 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -17,6 +17,7 @@
#include <linux/xattr.h>
#include <linux/kernfs.h>
+#include <linux/fs_context.h>
struct kernfs_iattrs {
struct iattr ia_iattr;
@@ -78,7 +79,7 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
}
extern const struct super_operations kernfs_sops;
-extern struct kmem_cache *kernfs_node_cache;
+extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index fdf527b6d79c..9a4646eecb71 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -20,17 +20,7 @@
#include "kernfs-internal.h"
-struct kmem_cache *kernfs_node_cache;
-
-static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
-{
- struct kernfs_root *root = kernfs_info(sb)->root;
- struct kernfs_syscall_ops *scops = root->syscall_ops;
-
- if (scops && scops->remount_fs)
- return scops->remount_fs(root, flags, data);
- return 0;
-}
+struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
@@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = {
.drop_inode = generic_delete_inode,
.evict_inode = kernfs_evict_inode,
- .remount_fs = kernfs_sop_remount_fs,
.show_options = kernfs_sop_show_options,
.show_path = kernfs_sop_show_path,
};
@@ -196,8 +185,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
return dentry;
knparent = find_next_ancestor(kn, NULL);
- if (WARN_ON(!knparent))
+ if (WARN_ON(!knparent)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
do {
struct dentry *dtmp;
@@ -206,8 +197,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
if (kn == knparent)
return dentry;
kntmp = find_next_ancestor(kn, knparent);
- if (WARN_ON(!kntmp))
+ if (WARN_ON(!kntmp)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
strlen(kntmp->name));
dput(dentry);
@@ -218,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
} while (true);
}
-static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
+static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
{
struct kernfs_super_info *info = kernfs_info(sb);
struct inode *inode;
@@ -229,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- sb->s_magic = magic;
+ sb->s_magic = kfc->magic;
sb->s_op = &kernfs_sops;
sb->s_xattr = kernfs_xattr_handlers;
if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
@@ -259,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
return 0;
}
-static int kernfs_test_super(struct super_block *sb, void *data)
+static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
{
struct kernfs_super_info *sb_info = kernfs_info(sb);
- struct kernfs_super_info *info = data;
+ struct kernfs_super_info *info = fc->s_fs_info;
return sb_info->root == info->root && sb_info->ns == info->ns;
}
-static int kernfs_set_super(struct super_block *sb, void *data)
+static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
{
- int error;
- error = set_anon_super(sb, data);
- if (!error)
- sb->s_fs_info = data;
- return error;
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ kfc->ns_tag = NULL;
+ return set_anon_super_fc(sb, fc);
}
/**
@@ -290,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb)
}
/**
- * kernfs_mount_ns - kernfs mount helper
- * @fs_type: file_system_type of the fs being mounted
- * @flags: mount flags specified for the mount
- * @root: kernfs_root of the hierarchy being mounted
- * @magic: file system specific magic number
- * @new_sb_created: tell the caller if we allocated a new superblock
- * @ns: optional namespace tag of the mount
+ * kernfs_get_tree - kernfs filesystem access/retrieval helper
+ * @fc: The filesystem context.
*
- * This is to be called from each kernfs user's file_system_type->mount()
- * implementation, which should pass through the specified @fs_type and
- * @flags, and specify the hierarchy and namespace tag to mount via @root
- * and @ns, respectively.
- *
- * The return value can be passed to the vfs layer verbatim.
+ * This is to be called from each kernfs user's fs_context->ops->get_tree()
+ * implementation, which should set the specified ->@fs_type and ->@flags, and
+ * specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
+ * respectively.
*/
-struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
- struct kernfs_root *root, unsigned long magic,
- bool *new_sb_created, const void *ns)
+int kernfs_get_tree(struct fs_context *fc)
{
+ struct kernfs_fs_context *kfc = fc->fs_private;
struct super_block *sb;
struct kernfs_super_info *info;
int error;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- info->root = root;
- info->ns = ns;
+ info->root = kfc->root;
+ info->ns = kfc->ns_tag;
INIT_LIST_HEAD(&info->node);
- sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
- &init_user_ns, info);
- if (IS_ERR(sb) || sb->s_fs_info != info)
- kfree(info);
+ fc->s_fs_info = info;
+ sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
if (IS_ERR(sb))
- return ERR_CAST(sb);
-
- if (new_sb_created)
- *new_sb_created = !sb->s_root;
+ return PTR_ERR(sb);
if (!sb->s_root) {
struct kernfs_super_info *info = kernfs_info(sb);
- error = kernfs_fill_super(sb, magic);
+ kfc->new_sb_created = true;
+
+ error = kernfs_fill_super(sb, kfc);
if (error) {
deactivate_locked_super(sb);
- return ERR_PTR(error);
+ return error;
}
sb->s_flags |= SB_ACTIVE;
mutex_lock(&kernfs_mutex);
- list_add(&info->node, &root->supers);
+ list_add(&info->node, &info->root->supers);
mutex_unlock(&kernfs_mutex);
}
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
+}
+
+void kernfs_free_fs_context(struct fs_context *fc)
+{
+ /* Note that we don't deal with kfc->ns_tag here. */
+ kfree(fc->s_fs_info);
+ fc->s_fs_info = NULL;
}
/**
@@ -373,36 +362,6 @@ void kernfs_kill_sb(struct super_block *sb)
kfree(info);
}
-/**
- * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
- * @kernfs_root: the kernfs_root in question
- * @ns: the namespace tag
- *
- * Pin the superblock so the superblock won't be destroyed in subsequent
- * operations. This can be used to block ->kill_sb() which may be useful
- * for kernfs users which dynamically manage superblocks.
- *
- * Returns NULL if there's no superblock associated to this kernfs_root, or
- * -EINVAL if the superblock is being freed.
- */
-struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
-{
- struct kernfs_super_info *info;
- struct super_block *sb = NULL;
-
- mutex_lock(&kernfs_mutex);
- list_for_each_entry(info, &root->supers, node) {
- if (info->ns == ns) {
- sb = info->sb;
- if (!atomic_inc_not_zero(&info->sb->s_active))
- sb = ERR_PTR(-EINVAL);
- break;
- }
- }
- mutex_unlock(&kernfs_mutex);
- return sb;
-}
-
void __init kernfs_init(void)
{
@@ -417,4 +376,9 @@ void __init kernfs_init(void)
0,
SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
NULL);
+
+ /* Creates slab cache for kernfs inode attributes */
+ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
+ sizeof(struct kernfs_iattrs),
+ 0, SLAB_PANIC, NULL);
}
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 214a2fa1f1e3..7df6324ccb8a 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -75,17 +75,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock,
}
/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("lockd: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NLMv4 basic data types
*
* Basic NLMv4 data types are defined in Appendix II, section 6.1.4
@@ -176,7 +165,6 @@ out_size:
dprintk("NFS: returned cookie was too long: %u\n", length);
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -236,7 +224,6 @@ out_bad_xdr:
__func__, be32_to_cpup(p));
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -309,7 +296,6 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
out:
return error;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 747b9c8c940a..4df62f635529 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -71,17 +71,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock,
}
/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("lockd: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NLMv3 basic data types
*
* Basic NLMv3 data types are not defined in an IETF standards
@@ -173,7 +162,6 @@ out_size:
dprintk("NFS: returned cookie was too long: %u\n", length);
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -231,7 +219,6 @@ out_enum:
__func__, be32_to_cpup(p));
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -303,7 +290,6 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
out:
return error;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
diff --git a/fs/locks.c b/fs/locks.c
index ff6af2c32601..eaa1cfaf73b0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1058,7 +1058,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
return -ENOMEM;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -1100,7 +1100,7 @@ find_conflict:
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -1138,7 +1138,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1312,7 +1312,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
/*
* Free any unused locks.
*/
@@ -1584,7 +1584,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1636,13 +1636,13 @@ restart:
locks_insert_block(fl, new_fl, leases_conflict);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_blocker, break_time);
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
@@ -1659,7 +1659,7 @@ restart:
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
@@ -1729,7 +1729,7 @@ int fcntl_getlease(struct file *filp)
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
@@ -1739,7 +1739,7 @@ int fcntl_getlease(struct file *filp)
break;
}
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
@@ -1813,7 +1813,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
return -EINVAL;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1884,7 +1884,7 @@ out_setup:
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
inode_unlock(inode);
@@ -1907,7 +1907,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
@@ -1920,7 +1920,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
@@ -2643,13 +2643,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
if (list_empty(&ctx->flc_lease))
return;
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
if (filp == fl->fl_file)
lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
diff --git a/fs/mount.h b/fs/mount.h
index f39bc9da4d73..6250de544760 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -146,3 +146,8 @@ static inline bool is_local_mountpoint(struct dentry *dentry)
return __is_local_mountpoint(dentry);
}
+
+static inline bool is_anon_ns(struct mnt_namespace *ns)
+{
+ return ns->seq == 0;
+}
diff --git a/fs/mpage.c b/fs/mpage.c
index c820dc9bebab..3f19da75178b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
page_endio(page, bio_op(bio),
blk_status_to_errno(bio->bi_status));
diff --git a/fs/namei.c b/fs/namei.c
index 914178cdbe94..dede0147b3f6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,7 +39,6 @@
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
-#include <linux/build_bug.h>
#include "internal.h"
#include "mount.h"
@@ -131,7 +130,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
struct filename *result;
char *kname;
int len;
- BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
result = audit_reusename(filename);
if (result)
@@ -2333,8 +2331,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
return err;
}
-static int filename_lookup(int dfd, struct filename *name, unsigned flags,
- struct path *path, struct path *root)
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root)
{
int retval;
struct nameidata nd;
@@ -2720,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
if (unlikely(error == -ESTALE))
error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
if (likely(!error))
- audit_inode(name, path->dentry, 0);
+ audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
restore_nameidata();
putname(name);
return error;
@@ -3462,6 +3460,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
+ ima_post_create_tmpfile(inode);
return child;
out_err:
diff --git a/fs/namespace.c b/fs/namespace.c
index a677b59efd74..c9cab307fa77 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
+#include <linux/fs_context.h>
#include "pnode.h"
#include "internal.h"
@@ -940,38 +941,81 @@ static struct mount *skip_mnt_tree(struct mount *p)
return p;
}
-struct vfsmount *
-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_create_mount - Create a mount for a configured superblock
+ * @fc: The configuration context with the superblock attached
+ *
+ * Create a mount to an already configured superblock. If necessary, the
+ * caller should invoke vfs_get_tree() before calling this.
+ *
+ * Note that this does not attach the mount to anything.
+ */
+struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
- struct dentry *root;
- if (!type)
- return ERR_PTR(-ENODEV);
+ if (!fc->root)
+ return ERR_PTR(-EINVAL);
- mnt = alloc_vfsmnt(name);
+ mnt = alloc_vfsmnt(fc->source ?: "none");
if (!mnt)
return ERR_PTR(-ENOMEM);
- if (flags & SB_KERNMOUNT)
+ if (fc->sb_flags & SB_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- root = mount_fs(type, flags, name, data);
- if (IS_ERR(root)) {
- mnt_free_id(mnt);
- free_vfsmnt(mnt);
- return ERR_CAST(root);
- }
+ atomic_inc(&fc->root->d_sb->s_active);
+ mnt->mnt.mnt_sb = fc->root->d_sb;
+ mnt->mnt.mnt_root = dget(fc->root);
+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
+ mnt->mnt_parent = mnt;
- mnt->mnt.mnt_root = root;
- mnt->mnt.mnt_sb = root->d_sb;
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
+ list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
return &mnt->mnt;
}
+EXPORT_SYMBOL(vfs_create_mount);
+
+struct vfsmount *fc_mount(struct fs_context *fc)
+{
+ int err = vfs_get_tree(fc);
+ if (!err) {
+ up_write(&fc->root->d_sb->s_umount);
+ return vfs_create_mount(fc);
+ }
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fc_mount);
+
+struct vfsmount *vfs_kern_mount(struct file_system_type *type,
+ int flags, const char *name,
+ void *data)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+ int ret = 0;
+
+ if (!type)
+ return ERR_PTR(-EINVAL);
+
+ fc = fs_context_for_mount(type, flags);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ if (name)
+ ret = vfs_parse_fs_string(fc, "source",
+ name, strlen(name));
+ if (!ret)
+ ret = parse_monolithic_mount_data(fc, data);
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+
+ put_fs_context(fc);
+ return mnt;
+}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
struct vfsmount *
@@ -1013,27 +1057,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_flags = old->mnt.mnt_flags;
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
- /* Don't allow unprivileged users to change mount flags */
- if (flag & CL_UNPRIVILEGED) {
- mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
-
- if (mnt->mnt.mnt_flags & MNT_READONLY)
- mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
-
- if (mnt->mnt.mnt_flags & MNT_NODEV)
- mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
-
- if (mnt->mnt.mnt_flags & MNT_NOSUID)
- mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
-
- if (mnt->mnt.mnt_flags & MNT_NOEXEC)
- mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
- }
-
- /* Don't allow unprivileged users to reveal what is under a mount */
- if ((flag & CL_UNPRIVILEGED) &&
- (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
- mnt->mnt.mnt_flags |= MNT_LOCKED;
atomic_inc(&sb->s_active);
mnt->mnt.mnt_sb = sb;
@@ -1464,6 +1487,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
static void shrink_submounts(struct mount *mnt);
+static int do_umount_root(struct super_block *sb)
+{
+ int ret = 0;
+
+ down_write(&sb->s_umount);
+ if (!sb_rdonly(sb)) {
+ struct fs_context *fc;
+
+ fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
+ SB_RDONLY);
+ if (IS_ERR(fc)) {
+ ret = PTR_ERR(fc);
+ } else {
+ ret = parse_monolithic_mount_data(fc, NULL);
+ if (!ret)
+ ret = reconfigure_super(fc);
+ put_fs_context(fc);
+ }
+ }
+ up_write(&sb->s_umount);
+ return ret;
+}
+
static int do_umount(struct mount *mnt, int flags)
{
struct super_block *sb = mnt->mnt.mnt_sb;
@@ -1529,11 +1575,7 @@ static int do_umount(struct mount *mnt, int flags)
*/
if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
- down_write(&sb->s_umount);
- if (!sb_rdonly(sb))
- retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
- up_write(&sb->s_umount);
- return retval;
+ return do_umount_root(sb);
}
namespace_lock();
@@ -1640,6 +1682,8 @@ int ksys_umount(char __user *name, int flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
+ lookup_flags |= LOOKUP_NO_EVAL;
+
retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
@@ -1837,6 +1881,33 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
return 0;
}
+static void lock_mnt_tree(struct mount *mnt)
+{
+ struct mount *p;
+
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ int flags = p->mnt.mnt_flags;
+ /* Don't allow unprivileged users to change mount flags */
+ flags |= MNT_LOCK_ATIME;
+
+ if (flags & MNT_READONLY)
+ flags |= MNT_LOCK_READONLY;
+
+ if (flags & MNT_NODEV)
+ flags |= MNT_LOCK_NODEV;
+
+ if (flags & MNT_NOSUID)
+ flags |= MNT_LOCK_NOSUID;
+
+ if (flags & MNT_NOEXEC)
+ flags |= MNT_LOCK_NOEXEC;
+ /* Don't allow unprivileged users to reveal what is under a mount */
+ if (list_empty(&p->mnt_expire))
+ flags |= MNT_LOCKED;
+ p->mnt.mnt_flags = flags;
+ }
+}
+
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
struct mount *p;
@@ -1954,6 +2025,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
struct mountpoint *dest_mp,
struct path *parent_path)
{
+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct mountpoint *smp;
@@ -2004,6 +2076,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
child->mnt_mountpoint);
if (q)
mnt_change_mountpoint(child, smp, q);
+ /* Notice when we are propagating across user namespaces */
+ if (child->mnt_parent->mnt_ns->user_ns != user_ns)
+ lock_mnt_tree(child);
commit_tree(child);
}
put_mountpoint(smp);
@@ -2311,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
int err;
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
- void *sec_opts = NULL;
+ struct fs_context *fc;
if (!check_mnt(mnt))
return -EINVAL;
@@ -2322,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
- if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) {
- err = security_sb_eat_lsm_opts(data, &sec_opts);
- if (err)
- return err;
- }
- err = security_sb_remount(sb, sec_opts);
- security_free_mnt_opts(&sec_opts);
- if (err)
- return err;
+ fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
- down_write(&sb->s_umount);
- err = -EPERM;
- if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
- err = do_remount_sb(sb, sb_flags, data, 0);
- if (!err)
- set_mount_attributes(mnt, mnt_flags);
+ err = parse_monolithic_mount_data(fc, data);
+ if (!err) {
+ down_write(&sb->s_umount);
+ err = -EPERM;
+ if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+ err = reconfigure_super(fc);
+ if (!err)
+ set_mount_attributes(mnt, mnt_flags);
+ }
+ up_write(&sb->s_umount);
}
- up_write(&sb->s_umount);
+ put_fs_context(fc);
return err;
}
@@ -2423,29 +2496,6 @@ out:
return err;
}
-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
-{
- int err;
- const char *subtype = strchr(fstype, '.');
- if (subtype) {
- subtype++;
- err = -EINVAL;
- if (!subtype[0])
- goto err;
- } else
- subtype = "";
-
- mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
- err = -ENOMEM;
- if (!mnt->mnt_sb->s_subtype)
- goto err;
- return mnt;
-
- err:
- mntput(mnt);
- return ERR_PTR(err);
-}
-
/*
* add a mount into a namespace's mount tree
*/
@@ -2490,7 +2540,39 @@ unlock:
return err;
}
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
+
+/*
+ * Create a new mount using a superblock configuration and request it
+ * be added to the namespace tree.
+ */
+static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
+ unsigned int mnt_flags)
+{
+ struct vfsmount *mnt;
+ struct super_block *sb = fc->root->d_sb;
+ int error;
+
+ error = security_sb_kern_mount(sb);
+ if (!error && mount_too_revealing(sb, &mnt_flags))
+ error = -EPERM;
+
+ if (unlikely(error)) {
+ fc_drop_locked(fc);
+ return error;
+ }
+
+ up_write(&sb->s_umount);
+
+ mnt = vfs_create_mount(fc);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+ if (error < 0)
+ mntput(mnt);
+ return error;
+}
/*
* create a new mount for userspace and request it to be added into the
@@ -2500,8 +2582,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct vfsmount *mnt;
- int err;
+ struct fs_context *fc;
+ const char *subtype = NULL;
+ int err = 0;
if (!fstype)
return -EINVAL;
@@ -2510,23 +2593,37 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
if (!type)
return -ENODEV;
- mnt = vfs_kern_mount(type, sb_flags, name, data);
- if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
- !mnt->mnt_sb->s_subtype)
- mnt = fs_set_subtype(mnt, fstype);
+ if (type->fs_flags & FS_HAS_SUBTYPE) {
+ subtype = strchr(fstype, '.');
+ if (subtype) {
+ subtype++;
+ if (!*subtype) {
+ put_filesystem(type);
+ return -EINVAL;
+ }
+ } else {
+ subtype = "";
+ }
+ }
+ fc = fs_context_for_mount(type, sb_flags);
put_filesystem(type);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- if (mount_too_revealing(mnt, &mnt_flags)) {
- mntput(mnt);
- return -EPERM;
- }
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ if (subtype)
+ err = vfs_parse_fs_string(fc, "subtype",
+ subtype, strlen(subtype));
+ if (!err && name)
+ err = vfs_parse_fs_string(fc, "source", name, strlen(name));
+ if (!err)
+ err = parse_monolithic_mount_data(fc, data);
+ if (!err)
+ err = vfs_get_tree(fc);
+ if (!err)
+ err = do_new_mount_fc(fc, path, mnt_flags);
- err = do_add_mount(real_mount(mnt), path, mnt_flags);
- if (err)
- mntput(mnt);
+ put_fs_context(fc);
return err;
}
@@ -2698,7 +2795,6 @@ static long exact_copy_from_user(void *to, const void __user * from,
if (!access_ok(from, n))
return n;
- current->kernel_uaccess_faults_ok++;
while (n) {
if (__get_user(c, f)) {
memset(t, 0, n);
@@ -2708,7 +2804,6 @@ static long exact_copy_from_user(void *to, const void __user * from,
f++;
n--;
}
- current->kernel_uaccess_faults_ok--;
return n;
}
@@ -2746,7 +2841,7 @@ void *copy_mount_options(const void __user * data)
char *copy_mount_string(const void __user *data)
{
- return data ? strndup_user(data, PAGE_SIZE) : NULL;
+ return data ? strndup_user(data, PATH_MAX) : NULL;
}
/*
@@ -2863,7 +2958,8 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
static void free_mnt_ns(struct mnt_namespace *ns)
{
- ns_free_inum(&ns->ns);
+ if (!is_anon_ns(ns))
+ ns_free_inum(&ns->ns);
dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
@@ -2878,7 +2974,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
*/
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
struct ucounts *ucounts;
@@ -2888,28 +2984,27 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
if (!ucounts)
return ERR_PTR(-ENOSPC);
- new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
if (!new_ns) {
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- dec_mnt_namespaces(ucounts);
- return ERR_PTR(ret);
+ if (!anon) {
+ ret = ns_alloc_inum(&new_ns->ns);
+ if (ret) {
+ kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
+ return ERR_PTR(ret);
+ }
}
new_ns->ns.ops = &mntns_operations;
- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+ if (!anon)
+ new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
atomic_set(&new_ns->count, 1);
- new_ns->root = NULL;
INIT_LIST_HEAD(&new_ns->list);
init_waitqueue_head(&new_ns->poll);
- new_ns->event = 0;
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
- new_ns->mounts = 0;
- new_ns->pending_mounts = 0;
return new_ns;
}
@@ -2933,7 +3028,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
old = ns->root;
- new_ns = alloc_mnt_ns(user_ns);
+ new_ns = alloc_mnt_ns(user_ns, false);
if (IS_ERR(new_ns))
return new_ns;
@@ -2941,13 +3036,18 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
- copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
+ copy_flags |= CL_SHARED_TO_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
free_mnt_ns(new_ns);
return ERR_CAST(new);
}
+ if (user_ns != ns->user_ns) {
+ lock_mount_hash();
+ lock_mnt_tree(new);
+ unlock_mount_hash();
+ }
new_ns->root = new;
list_add_tail(&new_ns->list, &new->mnt_list);
@@ -2988,37 +3088,25 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
return new_ns;
}
-/**
- * create_mnt_ns - creates a private namespace and adds a root filesystem
- * @mnt: pointer to the new root filesystem mountpoint
- */
-static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
-{
- struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
- if (!IS_ERR(new_ns)) {
- struct mount *mnt = real_mount(m);
- mnt->mnt_ns = new_ns;
- new_ns->root = mnt;
- new_ns->mounts++;
- list_add(&mnt->mnt_list, &new_ns->list);
- } else {
- mntput(m);
- }
- return new_ns;
-}
-
-struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
+struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
+ struct mount *mnt = real_mount(m);
struct mnt_namespace *ns;
struct super_block *s;
struct path path;
int err;
- ns = create_mnt_ns(mnt);
- if (IS_ERR(ns))
+ ns = alloc_mnt_ns(&init_user_ns, true);
+ if (IS_ERR(ns)) {
+ mntput(m);
return ERR_CAST(ns);
+ }
+ mnt->mnt_ns = ns;
+ ns->root = mnt;
+ ns->mounts++;
+ list_add(&mnt->mnt_list, &ns->list);
- err = vfs_path_lookup(mnt->mnt_root, mnt,
+ err = vfs_path_lookup(m->mnt_root, m,
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
put_mnt_ns(ns);
@@ -3228,6 +3316,7 @@ out0:
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
+ struct mount *m;
struct mnt_namespace *ns;
struct path root;
struct file_system_type *type;
@@ -3240,10 +3329,14 @@ static void __init init_mount_tree(void)
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = create_mnt_ns(mnt);
+ ns = alloc_mnt_ns(&init_user_ns, false);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
-
+ m = real_mount(mnt);
+ m->mnt_ns = ns;
+ ns->root = m;
+ ns->mounts = 1;
+ list_add(&m->mnt_list, &ns->list);
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
@@ -3297,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns)
free_mnt_ns(ns);
}
-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+struct vfsmount *kern_mount(struct file_system_type *type)
{
struct vfsmount *mnt;
- mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
+ mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
if (!IS_ERR(mnt)) {
/*
* it is a longterm mount, don't release mnt until
@@ -3310,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
}
return mnt;
}
-EXPORT_SYMBOL_GPL(kern_mount_data);
+EXPORT_SYMBOL_GPL(kern_mount);
void kern_unmount(struct vfsmount *mnt)
{
@@ -3352,7 +3445,8 @@ bool current_chrooted(void)
return chrooted;
}
-static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+static bool mnt_already_visible(struct mnt_namespace *ns,
+ const struct super_block *sb,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
@@ -3364,7 +3458,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
+ if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
@@ -3415,7 +3509,7 @@ found:
return visible;
}
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
@@ -3425,7 +3519,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
return false;
/* Can this filesystem be too revealing? */
- s_iflags = mnt->mnt_sb->s_iflags;
+ s_iflags = sb->s_iflags;
if (!(s_iflags & SB_I_USERNS_VISIBLE))
return false;
@@ -3435,7 +3529,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
return true;
}
- return !mnt_already_visible(ns, mnt, new_mnt_flags);
+ return !mnt_already_visible(ns, sb, new_mnt_flags);
}
bool mnt_may_suid(struct vfsmount *mnt)
@@ -3484,6 +3578,9 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
+ if (is_anon_ns(mnt_ns))
+ return -EINVAL;
+
if (fs->users != 1)
return -EINVAL;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a87a56273407..06233bfa6d73 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -72,16 +72,6 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
return xdr_ressize_check(rqstp, p);
}
-static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
-{
- __be32 *p;
-
- p = xdr_inline_decode(xdr, nbytes);
- if (unlikely(p == NULL))
- printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
- return p;
-}
-
static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
const char **str, size_t maxlen)
{
@@ -98,13 +88,13 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
fh->size = ntohl(*p);
if (fh->size > NFS4_FHSIZE)
return htonl(NFS4ERR_BADHANDLE);
- p = read_buf(xdr, fh->size);
+ p = xdr_inline_decode(xdr, fh->size);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
memcpy(&fh->data[0], p, fh->size);
@@ -117,11 +107,11 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
__be32 *p;
unsigned int attrlen;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
attrlen = ntohl(*p);
- p = read_buf(xdr, attrlen << 2);
+ p = xdr_inline_decode(xdr, attrlen << 2);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
if (likely(attrlen > 0))
@@ -135,7 +125,7 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
__be32 *p;
- p = read_buf(xdr, NFS4_STATEID_SIZE);
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
memcpy(stateid->data, p, NFS4_STATEID_SIZE);
@@ -156,7 +146,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ);
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 12);
+ p = xdr_inline_decode(xdr, 12);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
hdr->minorversion = ntohl(*p++);
@@ -176,7 +166,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
{
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE_HDR);
*op = ntohl(*p);
@@ -205,7 +195,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp,
status = decode_delegation_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
args->truncate = ntohl(*p);
@@ -227,7 +217,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
__be32 status = 0;
uint32_t iomode;
- p = read_buf(xdr, 4 * sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -245,14 +235,14 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 2 * sizeof(uint64_t));
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_range.offset);
p = xdr_decode_hyper(p, &args->cbl_range.length);
return decode_layout_stateid(xdr, &args->cbl_stateid);
} else if (args->cbl_recall_type == RETURN_FSID) {
- p = read_buf(xdr, 2 * sizeof(uint64_t));
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_fsid.major);
@@ -275,7 +265,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
args->ndevs = 0;
/* Num of device notifications */
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto out;
@@ -298,7 +288,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
for (i = 0; i < n; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
- p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+ p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) +
+ NFS4_DEVICEID4_SIZE);
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
@@ -329,7 +320,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
@@ -359,7 +350,7 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
{
__be32 *p;
- p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
+ p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
@@ -379,13 +370,13 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
goto out;
status = htonl(NFS4ERR_RESOURCE);
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
rc_list->rcl_nrefcalls = ntohl(*p++);
if (rc_list->rcl_nrefcalls) {
- p = read_buf(xdr,
+ p = xdr_inline_decode(xdr,
rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
@@ -418,7 +409,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
if (status)
return status;
- p = read_buf(xdr, 5 * sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
@@ -461,7 +452,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
uint32_t bitmap[2];
__be32 *p, status;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
args->craa_objs_to_keep = ntohl(*p++);
@@ -480,7 +471,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
struct cb_recallslotargs *args = argp;
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
args->crsa_target_highest_slotid = ntohl(*p++);
@@ -492,14 +483,14 @@ static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_arg
__be32 *p;
unsigned int len;
- p = read_buf(xdr, 12);
+ p = xdr_inline_decode(xdr, 12);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
len = be32_to_cpu(*p);
- p = read_buf(xdr, len);
+ p = xdr_inline_decode(xdr, len);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -537,7 +528,7 @@ static __be32 decode_write_response(struct xdr_stream *xdr,
__be32 *p;
/* skip the always zero field */
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out;
p++;
@@ -577,7 +568,7 @@ static __be32 decode_offload_args(struct svc_rqst *rqstp,
return status;
/* decode status */
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out;
args->error = ntohl(*p++);
@@ -943,10 +934,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
};
unsigned int nops = 0;
- xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
+ xdr_init_decode(&xdr_in, &rqstp->rq_arg,
+ rqstp->rq_arg.head[0].iov_base, NULL);
p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
- xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+ xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
if (status == htonl(NFS4ERR_RESOURCE))
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 885363ca8569..2f6b447cdd82 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -229,6 +229,8 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
spin_lock(&delegation->lock);
if (delegation->inode != NULL)
inode = igrab(delegation->inode);
+ if (!inode)
+ set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
spin_unlock(&delegation->lock);
return inode;
}
@@ -681,7 +683,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
/**
* nfs_super_return_all_delegations - return delegations for one superblock
- * @sb: sb to process
+ * @server: pointer to nfs_server to process
*
*/
void nfs_server_return_all_delegations(struct nfs_server *server)
@@ -944,10 +946,11 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
- if (test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags))
- continue;
- if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags) == 0)
continue;
if (!nfs_sb_active(server->super))
@@ -1053,10 +1056,11 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
- if (test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags))
- continue;
- if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
&delegation->flags) == 0)
continue;
if (!nfs_sb_active(server->super))
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index dcbf3394ba0e..35b4b02c1ae0 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,6 +34,7 @@ enum {
NFS_DELEGATION_RETURNING,
NFS_DELEGATION_REVOKED,
NFS_DELEGATION_TEST_EXPIRED,
+ NFS_DELEGATION_INODE_FREEING,
};
int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6bf4471850c8..a71d0b42d160 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -139,12 +139,19 @@ struct nfs_cache_array {
struct nfs_cache_array_entry array[0];
};
+struct readdirvec {
+ unsigned long nr;
+ unsigned long index;
+ struct page *pages[NFS_MAX_READDIR_RAPAGES];
+};
+
typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
typedef struct {
struct file *file;
struct page *page;
struct dir_context *ctx;
unsigned long page_index;
+ struct readdirvec pvec;
u64 *dir_cookie;
u64 last_cookie;
loff_t current_index;
@@ -524,6 +531,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
struct nfs_cache_array *array;
unsigned int count = 0;
int status;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
+
+ desc->pvec.index = desc->page_index;
+ desc->pvec.nr = 0;
scratch = alloc_page(GFP_KERNEL);
if (scratch == NULL)
@@ -548,20 +559,40 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (desc->plus)
nfs_prime_dcache(file_dentry(desc->file), entry);
- status = nfs_readdir_add_to_array(entry, page);
+ status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
+ if (status == -ENOSPC) {
+ desc->pvec.nr++;
+ if (desc->pvec.nr == max_rapages)
+ break;
+ status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
+ }
if (status != 0)
break;
} while (!entry->eof);
+ /*
+ * page and desc->pvec.pages[0] are valid, don't need to check
+ * whether or not to be NULL.
+ */
+ copy_highpage(page, desc->pvec.pages[0]);
+
out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
- array = kmap(page);
+ array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]);
array->eof_index = array->size;
status = 0;
- kunmap(page);
+ kunmap_atomic(array);
}
put_page(scratch);
+
+ /*
+ * desc->pvec.nr > 0 means at least one page was completely filled,
+ * we should return -ENOSPC. Otherwise function
+ * nfs_readdir_xdr_to_array will enter infinite loop.
+ */
+ if (desc->pvec.nr > 0)
+ return -ENOSPC;
return status;
}
@@ -574,8 +605,8 @@ void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
}
/*
- * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_pagearray
+ * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call
+ * to nfs_readdir_free_pages()
*/
static
int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
@@ -595,6 +626,24 @@ out_freepages:
return -ENOMEM;
}
+/*
+ * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure.
+ */
+static
+void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc)
+{
+ struct nfs_cache_array *array;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
+ int index;
+
+ for (index = 0; index < max_rapages; index++) {
+ array = kmap_atomic(desc->pvec.pages[index]);
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
+ kunmap_atomic(array);
+ }
+}
+
static
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
@@ -605,6 +654,12 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
int status = -ENOMEM;
unsigned int array_size = ARRAY_SIZE(pages);
+ /*
+ * This means we hit readdir rdpages miss, the preallocated rdpages
+ * are useless, the preallocate rdpages should be reinitialized.
+ */
+ nfs_readdir_rapages_init(desc);
+
entry.prev_cookie = 0;
entry.cookie = desc->last_cookie;
entry.eof = 0;
@@ -664,9 +719,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
struct inode *inode = file_inode(desc->file);
int ret;
- ret = nfs_readdir_xdr_to_array(desc, page, inode);
- if (ret < 0)
- goto error;
+ /*
+ * If desc->page_index in range desc->pvec.index and
+ * desc->pvec.index + desc->pvec.nr, we get readdir cache hit.
+ */
+ if (desc->page_index >= desc->pvec.index &&
+ desc->page_index < (desc->pvec.index + desc->pvec.nr)) {
+ /*
+ * page and desc->pvec.pages[x] are valid, don't need to check
+ * whether or not to be NULL.
+ */
+ copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]);
+ ret = 0;
+ } else {
+ ret = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (ret < 0)
+ goto error;
+ }
+
SetPageUptodate(page);
if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
@@ -831,6 +901,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
*desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
int res = 0;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
file, (long long)ctx->pos);
@@ -850,6 +921,12 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = nfs_use_readdirplus(inode, ctx);
+ res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages);
+ if (res < 0)
+ return -ENOMEM;
+
+ nfs_readdir_rapages_init(desc);
+
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -885,6 +962,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
} while (!desc->eof);
out:
+ nfs_readdir_free_pages(desc->pvec.pages, max_rapages);
if (res > 0)
res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
@@ -945,7 +1023,7 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
/**
* nfs_force_lookup_revalidate - Mark the directory as having changed
- * @dir - pointer to directory inode
+ * @dir: pointer to directory inode
*
* This forces the revalidation code in nfs_lookup_revalidate() to do a
* full lookup on all child dentries of 'dir' whenever a change occurs
@@ -1649,7 +1727,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
reval_dentry:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_dentry(dir, dentry, inode);;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode);
full_reval:
return nfs_do_lookup_revalidate(dir, dentry, flags);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 33824a0a57bf..0fd811ac08b5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -428,7 +428,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_read_sync_pgio_error(struct list_head *head)
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
@@ -664,8 +664,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &failed);
+ nfs_list_move_request(req, &failed);
spin_lock(&cinfo.inode->i_lock);
dreq->flags = 0;
if (desc.pg_error < 0)
@@ -821,7 +820,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_write_sync_pgio_error(struct list_head *head)
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 29553fdba8af..4899b85f9b3c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -89,8 +89,8 @@ EXPORT_SYMBOL_GPL(nfs_file_release);
/**
* nfs_revalidate_size - Revalidate the file size
- * @inode - pointer to inode struct
- * @file - pointer to struct file
+ * @inode: pointer to inode struct
+ * @filp: pointer to struct file
*
* Revalidates the file length. This is basically a wrapper around
* nfs_revalidate_inode() that takes into account the fact that we may
@@ -276,6 +276,12 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync);
* then a modify/write/read cycle when writing to a page in the
* page cache.
*
+ * Some pNFS layout drivers can only read/write at a certain block
+ * granularity like all block devices and therefore we must perform
+ * read/modify/write whenever a page hasn't read yet and the data
+ * to be written there is not aligned to a block boundary and/or
+ * smaller than the block size.
+ *
* The modify/write/read cycle may occur if a page is read before
* being completely filled by the writer. In this situation, the
* page must be completely written to stable storage on the server
@@ -291,26 +297,32 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync);
* and that the new data won't completely replace the old data in
* that range of the file.
*/
-static int nfs_want_read_modify_write(struct file *file, struct page *page,
- loff_t pos, unsigned len)
+static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len)
{
unsigned int pglen = nfs_page_length(page);
unsigned int offset = pos & (PAGE_SIZE - 1);
unsigned int end = offset + len;
- if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
- if (!PageUptodate(page))
- return 1;
- return 0;
- }
+ return !pglen || (end >= pglen && !offset);
+}
- if ((file->f_mode & FMODE_READ) && /* open for read? */
- !PageUptodate(page) && /* Uptodate? */
- !PagePrivate(page) && /* i/o request already? */
- pglen && /* valid bytes of file? */
- (end < pglen || offset)) /* replace all valid bytes? */
- return 1;
- return 0;
+static bool nfs_want_read_modify_write(struct file *file, struct page *page,
+ loff_t pos, unsigned int len)
+{
+ /*
+ * Up-to-date pages, those with ongoing or full-page write
+ * don't need read/modify/write
+ */
+ if (PageUptodate(page) || PagePrivate(page) ||
+ nfs_full_page_write(page, pos, len))
+ return false;
+
+ if (pnfs_ld_read_whole_page(file->f_mapping->host))
+ return true;
+ /* Open for reading too? */
+ if (file->f_mode & FMODE_READ)
+ return true;
+ return false;
}
/*
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 63abe705f4ca..f9264e1922a2 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -410,7 +410,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
struct cred *kcred;
- const struct cred *cred;
+ const struct cred __rcu *cred;
kuid_t uid;
kgid_t gid;
u32 ds_count, fh_count, id;
@@ -501,7 +501,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
kcred->fsuid = uid;
kcred->fsgid = gid;
- cred = kcred;
+ cred = RCU_INITIALIZER(kcred);
if (lgr->range.iomode == IOMODE_READ)
rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
@@ -788,30 +788,82 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
}
}
+static void
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_unavailable(devid);
+}
+
+static void
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_available(devid);
+}
+
static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx,
- int *best_idx)
+ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx,
+ bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
bool fail_return = false;
int idx;
- /* mirrors are sorted by efficiency */
+ /* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
if (idx+1 == fls->mirror_array_cnt)
- fail_return = true;
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
- if (ds) {
- *best_idx = idx;
- return ds;
- }
+ fail_return = !check_device;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return);
+ if (!ds)
+ continue;
+
+ if (check_device &&
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ continue;
+
+ *best_idx = idx;
+ return ds;
}
return NULL;
}
+static struct nfs4_pnfs_ds *
+ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+ if (ds)
+ return ds;
+ return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+}
+
static void
ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req,
@@ -925,7 +977,8 @@ retry:
goto out_mds;
for (i = 0; i < pgio->pg_mirror_count; i++) {
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
if (!ds) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -936,7 +989,6 @@ retry:
goto retry;
}
pgm = &pgio->pg_mirrors[i];
- mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
}
@@ -1071,6 +1123,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
break;
case -NFS4ERR_RETRY_UNCACHED_REP:
break;
+ case -EAGAIN:
+ return -NFS4ERR_RESET_TO_PNFS;
/* Invalidate Layout errors */
case -NFS4ERR_PNFS_NO_LAYOUT:
case -ESTALE: /* mapped NFS4ERR_STALE */
@@ -1131,6 +1185,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
case -EBADHANDLE:
case -ELOOP:
case -ENOSPC:
+ case -EAGAIN:
break;
case -EJUKEBOX:
nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
@@ -1158,8 +1213,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
- if (task->tk_status >= 0)
+ if (task->tk_status >= 0) {
+ ff_layout_mark_ds_reachable(lseg, idx);
return 0;
+ }
/* Handle the case of an invalid layout segment */
if (!pnfs_is_valid_lseg(lseg))
@@ -1222,6 +1279,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
GFP_NOIO);
+ if (status == NFS4ERR_NXIO)
+ ff_layout_mark_ds_unreachable(lseg, idx);
pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -1249,7 +1308,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
if (ff_layout_choose_best_ds_for_read(hdr->lseg,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
- goto out_eagain;
+ goto out_layouterror;
set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1260,6 +1319,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
}
return 0;
+out_layouterror:
+ ff_layout_send_layouterror(hdr->lseg);
out_eagain:
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -1293,15 +1354,6 @@ ff_layout_set_layoutcommit(struct inode *inode,
(unsigned long long) NFS_I(inode)->layout->plh_lwb);
}
-static bool
-ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
-{
- /* No mirroring for now */
- struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
-
- return ff_layout_test_devid_unavailable(node);
-}
-
static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1332,10 +1384,6 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
rpc_exit(task, -EIO);
return -EIO;
}
- if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
- rpc_exit(task, -EHOSTDOWN);
- return -EAGAIN;
- }
ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
@@ -1369,6 +1417,16 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
ff_layout_read_prepare_common(task, hdr);
}
+static void
+ff_layout_io_prepare_transmit(struct rpc_task *task,
+ void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (!pnfs_is_valid_lseg(hdr->lseg))
+ rpc_exit(task, -EAGAIN);
+}
+
static void ff_layout_read_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
@@ -1399,9 +1457,10 @@ static void ff_layout_read_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr);
- else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_read(hdr);
pnfs_generic_rw_release(data);
}
@@ -1513,11 +1572,6 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EIO;
}
- if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
- rpc_exit(task, -EHOSTDOWN);
- return -EAGAIN;
- }
-
ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1573,9 +1627,10 @@ static void ff_layout_write_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+ ff_layout_send_layouterror(hdr->lseg);
ff_layout_reset_write(hdr, true);
- else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_write(hdr, false);
pnfs_generic_rw_release(data);
}
@@ -1657,6 +1712,7 @@ static void ff_layout_commit_release(void *data)
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
.rpc_release = ff_layout_read_release,
@@ -1664,6 +1720,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
.rpc_release = ff_layout_read_release,
@@ -1671,6 +1728,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
.rpc_release = ff_layout_write_release,
@@ -1678,6 +1736,7 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
.rpc_release = ff_layout_write_release,
@@ -1703,6 +1762,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
u32 idx = hdr->pgio_mirror_idx;
@@ -1713,20 +1773,21 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
__func__, hdr->inode->i_ino,
hdr->args.pgbase, (size_t)hdr->args.count, offset);
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
if (!ds)
goto out_failed;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1734,13 +1795,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->pgio_done_cb = ff_layout_read_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
if (fh)
hdr->args.fh = fh;
- if (vers == 4 &&
- !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
- goto out_failed;
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1770,26 +1829,28 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
int vers;
struct nfs_fh *fh;
int idx = hdr->pgio_mirror_idx;
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
if (!ds)
goto out_failed;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
@@ -1800,13 +1861,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
hdr->ds_commit_idx = idx;
- fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
if (fh)
hdr->args.fh = fh;
- if (vers == 4 &&
- !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
- goto out_failed;
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1849,6 +1908,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct pnfs_layout_segment *lseg = data->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
u32 idx;
int vers, ret;
@@ -1859,20 +1919,21 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
goto out_err;
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
if (!ds)
goto out_err;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
data->inode);
if (IS_ERR(ds_clnt))
goto out_err;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
if (!ds_cred)
goto out_err;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
@@ -2036,7 +2097,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
dprintk("%s: Begin\n", __func__);
- xdr_init_encode(&tmp_xdr, &tmp_buf, NULL);
+ xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
@@ -2102,6 +2163,52 @@ out_nomem:
return -ENOMEM;
}
+#ifdef CONFIG_NFS_V4_2
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct nfs42_layout_error *errors;
+ LIST_HEAD(head);
+
+ if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
+ return;
+ ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
+ if (list_empty(&head))
+ return;
+
+ errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
+ sizeof(*errors), GFP_NOFS);
+ if (errors != NULL) {
+ const struct nfs4_ff_layout_ds_err *pos;
+ size_t n = 0;
+
+ list_for_each_entry(pos, &head, list) {
+ errors[n].offset = pos->offset;
+ errors[n].length = pos->length;
+ nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
+ errors[n].errors[0].dev_id = pos->deviceid;
+ errors[n].errors[0].status = pos->status;
+ errors[n].errors[0].opnum = pos->opnum;
+ n++;
+ if (!list_is_last(&pos->list, &head) &&
+ n < NFS42_LAYOUTERROR_MAX)
+ continue;
+ if (nfs42_proc_layouterror(lseg, errors, n) < 0)
+ break;
+ n = 0;
+ }
+ kfree(errors);
+ }
+ ff_layout_free_ds_ioerr(&head);
+}
+#else
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+}
+#endif
+
static int
ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
{
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index c2626bad466b..2f369966abf7 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -132,16 +132,6 @@ FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
generic_hdr);
}
-static inline struct nfs4_deviceid_node *
-FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
-{
- if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
- FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
- FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
- return NULL;
- return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
-}
-
static inline struct nfs4_ff_layout_ds *
FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
{
@@ -151,9 +141,25 @@ FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
static inline struct nfs4_ff_layout_mirror *
FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
{
- if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
- return NULL;
- return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+
+ if (idx < fls->mirror_array_cnt)
+ return fls->mirror_array[idx];
+ return NULL;
+}
+
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
+
+ if (mirror != NULL) {
+ struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+
+ if (!IS_ERR_OR_NULL(mirror_ds))
+ return &mirror_ds->id_node;
+ }
+ return NULL;
}
static inline u32
@@ -174,28 +180,10 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
}
-static inline bool
-ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
-{
- /*
- * Flexfiles should never mark a DS unavailable, but if it does
- * print a (ratelimited) warning as this can affect performance.
- */
- if (nfs4_test_deviceid_unavailable(node)) {
- u32 *p = (u32 *)node->deviceid.data;
-
- pr_warn_ratelimited("NFS: flexfiles layout referencing an "
- "unavailable device [%x%x%x%x]\n",
- p[0], p[1], p[2], p[3]);
- return true;
- }
- return false;
-}
-
static inline int
-nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
{
- return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
+ return mirror->mirror_ds->ds_versions[0].version;
}
struct nfs4_ff_layout_ds *
@@ -207,6 +195,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
struct nfs4_ff_layout_mirror *mirror, u64 offset,
u64 length, int status, enum nfs_opnum4 opnum,
gfp_t gfp_flags);
+void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
void ff_layout_free_ds_ioerr(struct list_head *head);
unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
@@ -214,23 +203,23 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
struct list_head *head,
unsigned int maxnum);
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
- u32 mirror_idx,
- nfs4_stateid *stateid);
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid);
struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
bool fail_return);
struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
- u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp,
struct inode *inode);
-const struct cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
- u32 ds_idx, const struct cred *mdscred);
+const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
+ const struct cred *mdscred);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 11766a74216d..a809989807d6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -183,56 +183,6 @@ out_err:
return NULL;
}
-static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
- struct nfs4_deviceid_node *devid)
-{
- nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid);
- if (!ff_layout_has_available_ds(lseg))
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
- lseg);
-}
-
-static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
- struct nfs4_ff_layout_mirror *mirror,
- bool create)
-{
- if (mirror == NULL || IS_ERR(mirror->mirror_ds))
- goto outerr;
- if (mirror->mirror_ds == NULL) {
- if (create) {
- struct nfs4_deviceid_node *node;
- struct pnfs_layout_hdr *lh = lseg->pls_layout;
- struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
-
- node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
- &mirror->devid, lh->plh_lc_cred,
- GFP_KERNEL);
- if (node)
- mirror_ds = FF_LAYOUT_MIRROR_DS(node);
-
- /* check for race with another call to this function */
- if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
- mirror_ds != ERR_PTR(-ENODEV))
- nfs4_put_deviceid_node(node);
- } else
- goto outerr;
- }
-
- if (IS_ERR(mirror->mirror_ds))
- goto outerr;
-
- if (mirror->mirror_ds->ds == NULL) {
- struct nfs4_deviceid_node *devid;
- devid = &mirror->mirror_ds->id_node;
- ff_layout_mark_devid_invalid(lseg, devid);
- return false;
- }
- return true;
-outerr:
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
- return false;
-}
-
static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
u64 offset, u64 length)
{
@@ -326,7 +276,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
ff_layout_add_ds_error_locked(flo, dserr);
spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-
return 0;
}
@@ -353,46 +302,54 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
}
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
- struct nfs_fh *fh = NULL;
-
- if (!ff_layout_mirror_valid(lseg, mirror, false)) {
- pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
- __func__, mirror_idx);
- goto out;
- }
-
/* FIXME: For now assume there is only 1 version available for the DS */
- fh = &mirror->fh_versions[0];
-out:
- return fh;
+ return &mirror->fh_versions[0];
}
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
- u32 mirror_idx,
- nfs4_stateid *stateid)
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+ if (nfs4_ff_layout_ds_version(mirror) == 4)
+ nfs4_stateid_copy(stateid, &mirror->stateid);
+}
- if (!ff_layout_mirror_valid(lseg, mirror, false)) {
- pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
- __func__, mirror_idx);
- goto out;
+static bool
+ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror == NULL)
+ goto outerr;
+ if (mirror->mirror_ds == NULL) {
+ struct nfs4_deviceid_node *node;
+ struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
+
+ node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
+ &mirror->devid, lo->plh_lc_cred,
+ GFP_KERNEL);
+ if (node)
+ mirror_ds = FF_LAYOUT_MIRROR_DS(node);
+
+ /* check for race with another call to this function */
+ if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ mirror_ds != ERR_PTR(-ENODEV))
+ nfs4_put_deviceid_node(node);
}
- nfs4_stateid_copy(stateid, &mirror->stateid);
- return 1;
-out:
- return 0;
+ if (IS_ERR(mirror->mirror_ds))
+ goto outerr;
+
+ return true;
+outerr:
+ return false;
}
/**
* nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
* @lseg: the layout segment we're operating on
- * @ds_idx: index of the DS to use
+ * @mirror: layout mirror describing the DS to use
* @fail_return: return layout on connect failure?
*
* Try to prepare a DS connection to accept an RPC call. This involves
@@ -407,26 +364,18 @@ out:
* Returns a pointer to a connected DS object on success or NULL on failure.
*/
struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
bool fail_return)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
struct nfs4_pnfs_ds *ds = NULL;
- struct nfs4_deviceid_node *devid;
struct inode *ino = lseg->pls_layout->plh_inode;
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
int status;
- if (!ff_layout_mirror_valid(lseg, mirror, true)) {
- pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
- __func__, ds_idx);
- goto out;
- }
-
- devid = &mirror->mirror_ds->id_node;
- if (ff_layout_test_devid_unavailable(devid))
- goto out_fail;
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ goto noconnect;
ds = mirror->mirror_ds->ds;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -437,8 +386,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
*/
- status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
- dataserver_retrans,
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ dataserver_timeo, dataserver_retrans,
mirror->mirror_ds->ds_versions[0].version,
mirror->mirror_ds->ds_versions[0].minor_version);
@@ -453,11 +402,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror->mirror_ds->ds_versions[0].wsize = max_payload;
goto out;
}
-out_fail:
+noconnect:
ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
+ ff_layout_send_layouterror(lseg);
if (fail_return || !ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(ino, lseg);
ds = NULL;
@@ -466,14 +416,14 @@ out:
}
const struct cred *
-ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
const struct cred *mdscred)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
const struct cred *cred;
if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
- cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+ cred = ff_layout_get_mirror_cred(mirror, range->iomode);
if (!cred)
cred = get_cred(mdscred);
} else {
@@ -483,15 +433,18 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
}
/**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client
+ * @mirror: pointer to the mirror
+ * @ds_clp: nfs_client for the DS
+ * @inode: pointer to inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp, struct inode *inode)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-
switch (mirror->mirror_ds->ds_versions[0].version) {
case 3:
/* For NFSv3 DS, flavor is set when creating DS connections */
@@ -608,7 +561,7 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
if (IS_ERR(mirror->mirror_ds))
continue;
devid = &mirror->mirror_ds->id_node;
- if (!ff_layout_test_devid_unavailable(devid))
+ if (!nfs4_test_deviceid_unavailable(devid))
return true;
}
}
@@ -629,7 +582,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
if (!mirror->mirror_ds)
continue;
devid = &mirror->mirror_ds->id_node;
- if (ff_layout_test_devid_unavailable(devid))
+ if (nfs4_test_deviceid_unavailable(devid))
return false;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 094775ea0781..414a90d48493 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -143,6 +143,7 @@ EXPORT_SYMBOL_GPL(nfs_sync_inode);
/**
* nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ * @mapping: pointer to struct address_space
*/
int nfs_sync_mapping(struct address_space *mapping)
{
@@ -1184,8 +1185,8 @@ int nfs_attribute_cache_expired(struct inode *inode)
/**
* nfs_revalidate_inode - Revalidate the inode attributes
- * @server - pointer to nfs_server struct
- * @inode - pointer to inode struct
+ * @server: pointer to nfs_server struct
+ * @inode: pointer to inode struct
*
* Updates inode attribute information by retrieving the data from the server.
*/
@@ -1255,8 +1256,8 @@ out:
/**
* nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
*/
int nfs_revalidate_mapping(struct inode *inode,
struct address_space *mapping)
@@ -1371,8 +1372,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/**
* nfs_check_inode_attributes - verify consistency of the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* Verifies the attribute cache. If we have just changed the attributes,
* so that fattr carries weak cache consistency data, then it may
@@ -1572,8 +1573,8 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
/**
* nfs_inode_attrs_need_update - check if the inode attributes need updating
- * @inode - pointer to inode
- * @fattr - attributes
+ * @inode: pointer to inode
+ * @fattr: attributes
*
* Attempt to divine whether or not an RPC call reply carrying stale
* attributes got scheduled after another call carrying updated ones.
@@ -1614,8 +1615,8 @@ static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr
/**
* nfs_refresh_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* Check that an RPC call that returned attributes has not overlapped with
* other recent updates of the inode metadata, then decide whether it is
@@ -1649,8 +1650,8 @@ static int nfs_post_op_update_inode_locked(struct inode *inode,
/**
* nfs_post_op_update_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it.
@@ -1679,8 +1680,8 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
/**
* nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it. Fake up
@@ -1731,8 +1732,8 @@ out_noforce:
/**
* nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it. Fake up
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b1e577302518..c7cf23ae6597 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -69,7 +69,8 @@ struct nfs_clone_mount {
* Maximum number of pages that readdir can use for creating
* a vmapped array of pages.
*/
-#define NFS_MAX_READDIR_PAGES 8
+#define NFS_MAX_READDIR_PAGES 64
+#define NFS_MAX_READDIR_RAPAGES 8
struct nfs_client_initdata {
unsigned long init_flags;
@@ -755,6 +756,7 @@ static inline bool nfs_error_is_fatal(int err)
{
switch (err) {
case -ERESTARTSYS:
+ case -EINTR:
case -EACCES:
case -EDQUOT:
case -EFBIG:
@@ -763,6 +765,7 @@ static inline bool nfs_error_is_fatal(int err)
case -EROFS:
case -ESTALE:
case -E2BIG:
+ case -ENOMEM:
return true;
default:
return false;
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 9034b4926909..5088fda9b453 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -25,7 +25,7 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
/**
* nfs_start_io_read - declare the file is being used for buffered reads
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is about to start, and ensure
* that we block all direct I/O.
@@ -56,7 +56,7 @@ nfs_start_io_read(struct inode *inode)
/**
* nfs_end_io_read - declare that the buffered read operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is done, and release the shared
* lock on inode->i_rwsem.
@@ -69,7 +69,7 @@ nfs_end_io_read(struct inode *inode)
/**
* nfs_start_io_write - declare the file is being used for buffered writes
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is about to start, and ensure
* that we block all direct I/O.
@@ -83,7 +83,7 @@ nfs_start_io_write(struct inode *inode)
/**
* nfs_end_io_write - declare that the buffered write operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered write operation is done, and release the
* lock on inode->i_rwsem.
@@ -105,7 +105,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
/**
* nfs_end_io_direct - declare the file is being used for direct i/o
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a direct I/O operation is about to start, and ensure
* that we block all buffered I/O.
@@ -136,7 +136,7 @@ nfs_start_io_direct(struct inode *inode)
/**
* nfs_end_io_direct - declare that the direct i/o operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a direct I/O operation is done, and release the shared
* lock on inode->i_rwsem.
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e5686be67be8..15f099a24c29 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -221,10 +221,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
/**
* nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry - parent directory
- * @fh - filehandle for new root dentry
- * @fattr - attributes for new root inode
- * @authflavor - security flavor to use when performing the mount
+ * @dentry: parent directory
+ * @fh: filehandle for new root dentry
+ * @fattr: attributes for new root inode
+ * @authflavor: security flavor to use when performing the mount
*
*/
struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 350675e3ed47..a7ed29de0a40 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -22,6 +22,7 @@
#include <linux/nfs.h>
#include <linux/nfs2.h>
#include <linux/nfs_fs.h>
+#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
@@ -55,42 +56,16 @@
#define NFS_attrstat_sz (1+NFS_fattr_sz)
#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz)
-#define NFS_readlinkres_sz (2)
-#define NFS_readres_sz (1+NFS_fattr_sz+1)
+#define NFS_readlinkres_sz (2+1)
+#define NFS_readres_sz (1+NFS_fattr_sz+1+1)
#define NFS_writeres_sz (NFS_attrstat_sz)
#define NFS_stat_sz (1)
-#define NFS_readdirres_sz (1)
+#define NFS_readdirres_sz (1+1)
#define NFS_statfsres_sz (1+NFS_info_sz)
static int nfs_stat_to_errno(enum nfs_stat);
/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
- unsigned int base, unsigned int len,
- unsigned int bufsize)
-{
- struct rpc_auth *auth = req->rq_cred->cr_auth;
- unsigned int replen;
-
- replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NFSv2 basic data types
*
* Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
@@ -110,8 +85,8 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
recvd = xdr_read_pages(xdr, count);
if (unlikely(count > recvd))
@@ -125,9 +100,6 @@ out_cheating:
"count %u > recvd %u\n", count, recvd);
count = recvd;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -157,13 +129,16 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
*status = be32_to_cpup(p);
+ trace_nfs_xdr_status((int)*status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -205,14 +180,11 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
__be32 *p;
p = xdr_inline_decode(xdr, NFS2_FHSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fh->size = NFS2_FHSIZE;
memcpy(fh->data, p, NFS2_FHSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -282,8 +254,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fattr->valid |= NFS_ATTR_FATTR_V2;
@@ -325,9 +297,6 @@ out_uid:
out_gid:
dprintk("NFS: returned invalid gid\n");
return -EINVAL;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -416,23 +385,20 @@ static int decode_filename_inline(struct xdr_stream *xdr,
u32 count;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (count > NFS3_MAXNAMLEN)
goto out_nametoolong;
p = xdr_inline_decode(xdr, count);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*name = (const char *)p;
*length = count;
return 0;
out_nametoolong:
dprintk("NFS: returned filename too long: %u\n", count);
return -ENAMETOOLONG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -455,8 +421,8 @@ static int decode_path(struct xdr_stream *xdr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
length = be32_to_cpup(p);
if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
goto out_size;
@@ -472,9 +438,6 @@ out_cheating:
dprintk("NFS: server cheating in pathname result: "
"length %u > received %u\n", length, recvd);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -615,8 +578,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
const struct nfs_readlinkargs *args = data;
encode_fhandle(xdr, args->fh);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->pglen, NFS_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, NFS_readlinkres_sz);
}
/*
@@ -651,8 +614,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
const struct nfs_pgio_args *args = data;
encode_readargs(xdr, args);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->count, NFS_readres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, NFS_readres_sz);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -809,8 +772,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
const struct nfs_readdirargs *args = data;
encode_readdirargs(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
- args->count, NFS_readdirres_sz);
+ rpc_prepare_reply_pages(req, args->pages, 0,
+ args->count, NFS_readdirres_sz);
}
/*
@@ -951,12 +914,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
int error;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p++ == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p++ == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -964,8 +927,8 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
}
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
entry->ino = be32_to_cpup(p);
error = decode_filename_inline(xdr, &entry->name, &entry->len);
@@ -978,17 +941,13 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
*/
entry->prev_cookie = entry->cookie;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
entry->cookie = be32_to_cpup(p);
entry->d_type = DT_UNKNOWN;
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
}
/*
@@ -1052,17 +1011,14 @@ static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
__be32 *p;
p = xdr_inline_decode(xdr, NFS_info_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->tsize = be32_to_cpup(p++);
result->bsize = be32_to_cpup(p++);
result->blocks = be32_to_cpup(p++);
result->bfree = be32_to_cpup(p++);
result->bavail = be32_to_cpup(p);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9fce18548f7e..c5c3fc6e6c60 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -222,8 +222,6 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
switch (status) {
case 0:
status = nfs_refresh_inode(inode, fattr);
- set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
- set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
break;
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 78df4eb60f85..110358f4986d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -21,6 +21,7 @@
#include <linux/nfs3.h>
#include <linux/nfs_fs.h>
#include <linux/nfsacl.h>
+#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
@@ -68,13 +69,13 @@
#define NFS3_removeres_sz (NFS3_setattrres_sz)
#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
-#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1)
-#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3)
+#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1)
+#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1)
#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
-#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2)
+#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1)
#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13)
#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12)
#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6)
@@ -84,7 +85,7 @@
#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
- XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1)
#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
static int nfs3_stat_to_errno(enum nfs_stat);
@@ -104,32 +105,6 @@ static const umode_t nfs_type2fmt[] = {
};
/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
- unsigned int base, unsigned int len,
- unsigned int bufsize)
-{
- struct rpc_auth *auth = req->rq_cred->cr_auth;
- unsigned int replen;
-
- replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NFSv3 basic data types
*
* Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
@@ -151,13 +126,10 @@ static int decode_uint32(struct xdr_stream *xdr, u32 *value)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*value = be32_to_cpup(p);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_uint64(struct xdr_stream *xdr, u64 *value)
@@ -165,13 +137,10 @@ static int decode_uint64(struct xdr_stream *xdr, u64 *value)
__be32 *p;
p = xdr_inline_decode(xdr, 8);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
xdr_decode_hyper(p, value);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -211,14 +180,14 @@ static int decode_inline_filename3(struct xdr_stream *xdr,
u32 count;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (count > NFS3_MAXNAMLEN)
goto out_nametoolong;
p = xdr_inline_decode(xdr, count);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*name = (const char *)p;
*length = count;
return 0;
@@ -226,9 +195,6 @@ static int decode_inline_filename3(struct xdr_stream *xdr,
out_nametoolong:
dprintk("NFS: returned filename too long: %u\n", count);
return -ENAMETOOLONG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -249,8 +215,8 @@ static int decode_nfspath3(struct xdr_stream *xdr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
goto out_nametoolong;
@@ -267,9 +233,6 @@ out_cheating:
dprintk("NFS: server cheating in pathname result: "
"count %u > recvd %u\n", count, recvd);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -303,13 +266,10 @@ static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -330,13 +290,10 @@ static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
memcpy(verifier->data, p, NFS3_WRITEVERFSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -364,13 +321,16 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS3_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
*status = be32_to_cpup(p);
+ trace_nfs_xdr_status((int)*status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -453,23 +413,20 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
length = be32_to_cpup(p++);
if (unlikely(length > NFS3_FHSIZE))
goto out_toobig;
p = xdr_inline_decode(xdr, length);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fh->size = length;
memcpy(fh->data, p, length);
return 0;
out_toobig:
dprintk("NFS: file handle size (%u) too big\n", length);
return -E2BIG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static void zero_nfs_fh3(struct nfs_fh *fh)
@@ -655,8 +612,8 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
p = xdr_decode_ftype3(p, &fmode);
@@ -690,9 +647,6 @@ out_uid:
out_gid:
dprintk("NFS: returned invalid gid\n");
return -EINVAL;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -710,14 +664,11 @@ static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_fattr3(xdr, fattr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -733,8 +684,8 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fattr->valid |= NFS_ATTR_FATTR_PRESIZE
| NFS_ATTR_FATTR_PRECHANGE
@@ -747,9 +698,6 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -773,14 +721,11 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_wcc_attr(xdr, fattr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
@@ -808,15 +753,12 @@ out:
static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
{
__be32 *p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_nfs_fh3(xdr, fh);
zero_nfs_fh3(fh);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -953,8 +895,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
const struct nfs3_readlinkargs *args = data;
encode_nfs_fh3(xdr, args->fh);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->pglen, NFS3_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, NFS3_readlinkres_sz);
}
/*
@@ -986,8 +928,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
encode_read3args(xdr, args);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->count, replen);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, replen);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -1279,7 +1221,7 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdir3args(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
args->count, NFS3_readdirres_sz);
}
@@ -1321,7 +1263,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdirplus3args(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
args->count, NFS3_readdirres_sz);
}
@@ -1366,7 +1308,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
encode_nfs_fh3(xdr, args->fh);
encode_uint32(xdr, args->mask);
if (args->mask & (NFS_ACL | NFS_DFACL)) {
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
NFSACL_MAXPAGES << PAGE_SHIFT,
ACL3_getaclres_sz);
req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
@@ -1643,8 +1585,8 @@ static int decode_read3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 + 4 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p++);
eof = be32_to_cpup(p++);
ocount = be32_to_cpup(p++);
@@ -1667,9 +1609,6 @@ out_cheating:
count = recvd;
eof = 0;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -1690,7 +1629,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
result->op_status = status;
if (status != NFS3_OK)
goto out_status;
- result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
+ result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2);
error = decode_read3resok(xdr, result);
out:
return error;
@@ -1731,22 +1670,18 @@ static int decode_write3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->count = be32_to_cpup(p++);
result->verf->committed = be32_to_cpup(p++);
if (unlikely(result->verf->committed > NFS_FILE_SYNC))
goto out_badvalue;
if (decode_writeverf3(xdr, &result->verf->verifier))
- goto out_eio;
+ return -EIO;
return result->count;
out_badvalue:
dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
-out_eio:
- return -EIO;
}
static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -2010,12 +1945,12 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
u64 new_cookie;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -2051,8 +1986,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
/* In fact, a post_op_fh3: */
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p != xdr_zero) {
error = decode_nfs_fh3(xdr, entry->fh);
if (unlikely(error)) {
@@ -2069,9 +2004,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
out_truncated:
dprintk("NFS: directory entry contains invalid file handle\n");
*entry = old;
@@ -2183,8 +2115,8 @@ static int decode_fsstat3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 8 * 6 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
p = xdr_decode_size3(p, &result->tbytes);
p = xdr_decode_size3(p, &result->fbytes);
p = xdr_decode_size3(p, &result->abytes);
@@ -2193,9 +2125,6 @@ static int decode_fsstat3resok(struct xdr_stream *xdr,
xdr_decode_size3(p, &result->afiles);
/* ignore invarsec */
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
@@ -2255,8 +2184,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->rtmax = be32_to_cpup(p++);
result->rtpref = be32_to_cpup(p++);
result->rtmult = be32_to_cpup(p++);
@@ -2270,9 +2199,6 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
/* ignore properties */
result->lease_time = 0;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
@@ -2328,15 +2254,12 @@ static int decode_pathconf3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 * 6);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->max_link = be32_to_cpup(p++);
result->max_namelen = be32_to_cpup(p);
/* ignore remaining fields */
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 19ec38f85ce0..901cca7542f9 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -20,5 +20,8 @@ loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors,
+ size_t n);
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index fed06fd9998d..ff6f85fb676b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -672,6 +672,170 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
return 0;
}
+static struct nfs42_layouterror_data *
+nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags)
+{
+ struct nfs42_layouterror_data *data;
+ struct inode *inode = lseg->pls_layout->plh_inode;
+
+ data = kzalloc(sizeof(*data), gfp_flags);
+ if (data) {
+ data->args.inode = data->inode = nfs_igrab_and_active(inode);
+ if (data->inode) {
+ data->lseg = pnfs_get_lseg(lseg);
+ if (data->lseg)
+ return data;
+ nfs_iput_and_deactive(data->inode);
+ }
+ kfree(data);
+ }
+ return NULL;
+}
+
+static void
+nfs42_free_layouterror_data(struct nfs42_layouterror_data *data)
+{
+ pnfs_put_lseg(data->lseg);
+ nfs_iput_and_deactive(data->inode);
+ kfree(data);
+}
+
+static void
+nfs42_layouterror_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+ unsigned i;
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ rpc_exit(task, 0);
+ return;
+ }
+ for (i = 0; i < data->args.num_errors; i++)
+ nfs4_stateid_copy(&data->args.errors[i].stateid,
+ &lo->plh_stateid);
+ spin_unlock(&inode->i_lock);
+ nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+ &data->res.seq_res, task);
+}
+
+static void
+nfs42_layouterror_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ switch (task->tk_status) {
+ case 0:
+ break;
+ case -NFS4ERR_BADHANDLE:
+ case -ESTALE:
+ pnfs_destroy_layout(NFS_I(inode));
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ nfs_commit_inode(inode, 0);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ /* Do we need to delay before resending? */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+ &data->args.errors[0].stateid))
+ rpc_delay(task, HZ);
+ rpc_restart_call_prepare(task);
+ }
+ spin_unlock(&inode->i_lock);
+ break;
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
+ }
+}
+
+static void
+nfs42_layouterror_release(void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+
+ nfs42_free_layouterror_data(data);
+}
+
+static const struct rpc_call_ops nfs42_layouterror_ops = {
+ .rpc_call_prepare = nfs42_layouterror_prepare,
+ .rpc_call_done = nfs42_layouterror_done,
+ .rpc_release = nfs42_layouterror_release,
+};
+
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors, size_t n)
+{
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct nfs42_layouterror_data *data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR],
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs42_layouterror_ops,
+ .flags = RPC_TASK_ASYNC,
+ };
+ unsigned int i;
+
+ if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR))
+ return -EOPNOTSUPP;
+ if (n > NFS42_LAYOUTERROR_MAX)
+ return -EINVAL;
+ data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+ for (i = 0; i < n; i++) {
+ data->args.errors[i] = errors[i];
+ data->args.num_errors++;
+ data->res.num_errors++;
+ }
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ task_setup.callback_data = data;
+ task_setup.rpc_client = NFS_SERVER(inode)->client;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+ task = rpc_run_task(&task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs42_proc_layouterror);
+
static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct file *dst_f, struct nfs_lock_context *src_lock,
struct nfs_lock_context *dst_lock, loff_t src_offset,
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 69f72ed2bf87..aed865a84629 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -51,6 +51,15 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* status */ + 1 /* opnum */)
+#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ 1 /* Array size */ + \
+ encode_device_error_maxsz)
+#define decode_layouterror_maxsz (op_decode_hdr_maxsz)
#define encode_clone_maxsz (encode_stateid_maxsz + \
encode_stateid_maxsz + \
2 /* src offset */ + \
@@ -59,43 +68,53 @@
#define decode_clone_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_allocate_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_allocate_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_savefh_maxsz + \
encode_putfh_maxsz + \
encode_copy_maxsz + \
encode_commit_maxsz)
#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_savefh_maxsz + \
decode_putfh_maxsz + \
decode_copy_maxsz + \
decode_commit_maxsz)
#define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_offload_cancel_maxsz)
#define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_offload_cancel_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_deallocate_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_seek_maxsz)
#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_seek_maxsz)
#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
@@ -106,6 +125,16 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
+#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ encode_layouterror_maxsz)
+#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ decode_layouterror_maxsz)
#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -223,6 +252,34 @@ static void encode_clone(struct xdr_stream *xdr,
xdr_encode_hyper(p, args->count);
}
+static void encode_device_error(struct xdr_stream *xdr,
+ const struct nfs42_device_error *error)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4);
+ p = xdr_encode_opaque_fixed(p, error->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(error->status);
+ *p = cpu_to_be32(error->opnum);
+}
+
+static void encode_layouterror(struct xdr_stream *xdr,
+ const struct nfs42_layout_error *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, args->offset);
+ p = xdr_encode_hyper(p, args->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(1);
+ encode_device_error(xdr, &args->errors[0]);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -381,6 +438,27 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode LAYOUTERROR request
+ */
+static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_layouterror_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ int i;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ for (i = 0; i < args->num_errors; i++)
+ encode_layouterror(xdr, &args->errors[i], &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -394,7 +472,7 @@ static int decode_write_response(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
count = be32_to_cpup(p);
if (count > 1)
return -EREMOTEIO;
@@ -402,18 +480,14 @@ static int decode_write_response(struct xdr_stream *xdr,
status = decode_opaque_fixed(xdr, &res->stateid,
NFS4_STATEID_SIZE);
if (unlikely(status))
- goto out_overflow;
+ return -EIO;
}
p = xdr_inline_decode(xdr, 8 + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->count);
res->verifier.committed = be32_to_cpup(p);
return decode_verifier(xdr, &res->verifier.verifier);
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_copy_requirements(struct xdr_stream *xdr,
@@ -422,14 +496,11 @@ static int decode_copy_requirements(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4 + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->consecutive = be32_to_cpup(p++);
res->synchronous = be32_to_cpup(p++);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
@@ -474,15 +545,11 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
p = xdr_inline_decode(xdr, 4 + 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->sr_eof = be32_to_cpup(p++);
p = xdr_decode_hyper(p, &res->sr_offset);
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutstats(struct xdr_stream *xdr)
@@ -495,6 +562,11 @@ static int decode_clone(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_CLONE);
}
+static int decode_layouterror(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTERROR);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -704,4 +776,30 @@ out:
return status;
}
+/*
+ * Decode LAYOUTERROR request
+ */
+static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_layouterror_res *res = data;
+ struct compound_hdr hdr;
+ int status, i;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+
+ for (i = 0; i < res->num_errors && status == 0; i++)
+ status = decode_layouterror(xdr);
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 2548405da1f7..1339ede979af 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -42,7 +42,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
}
#ifdef CONFIG_NFS_V4_1
-/**
+/*
* Per auth flavor data server rpc clients
*/
struct nfs4_ds_server {
@@ -51,7 +51,9 @@ struct nfs4_ds_server {
};
/**
- * Common lookup case for DS I/O
+ * nfs4_find_ds_client - Common lookup case for DS I/O
+ * @ds_clp: pointer to the DS's nfs_client
+ * @flavor: rpc auth flavour to match
*/
static struct nfs4_ds_server *
nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
@@ -118,9 +120,13 @@ nfs4_free_ds_server(struct nfs4_ds_server *dss)
}
/**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_find_or_create_ds_client - Find or create a DS rpc client
+ * @ds_clp: pointer to the DS's nfs_client
+ * @inode: pointer to the inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
struct rpc_clnt *
nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
{
@@ -145,7 +151,6 @@ static void
nfs4_shutdown_ds_clients(struct nfs_client *clp)
{
struct nfs4_ds_server *dss;
- LIST_HEAD(shutdown_list);
while (!list_empty(&clp->cl_ds_clients)) {
dss = list_entry(clp->cl_ds_clients.next,
@@ -284,7 +289,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
/**
* nfs40_init_client - nfs_client initialization tasks for NFSv4.0
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
*
* Returns zero on success, or a negative errno if some error occurred.
*/
@@ -312,7 +317,7 @@ int nfs40_init_client(struct nfs_client *clp)
/**
* nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
*
* Returns zero on success, or a negative errno if some error occurred.
*/
@@ -360,9 +365,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
* nfs4_init_client - Initialise an NFS4 client record
*
* @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: callback IP address in presentation format
- * @authflavor: authentication flavor for underlying RPC transport
+ * @cl_init: pointer to nfs_client_initdata
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
@@ -649,13 +652,13 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1,
/**
* nfs4_detect_session_trunking - Checks for session trunking.
- *
- * Called after a successful EXCHANGE_ID on a multi-addr connection.
- * Upon success, add the transport.
- *
* @clp: original mount nfs_client
* @res: result structure from an exchange_id using the original mount
* nfs_client with a new multi_addr transport
+ * @xprt: pointer to the transport to add.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
*
* Returns zero on success, otherwise -EINVAL
*
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 3f23b6840547..bf34ddaa2ad7 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -44,6 +44,7 @@
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
#include <linux/module.h>
#include "internal.h"
@@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy;
struct idmap_legacy_upcalldata {
struct rpc_pipe_msg pipe_msg;
struct idmap_msg idmap_msg;
- struct key_construction *key_cons;
+ struct key *authkey;
struct idmap *idmap;
};
@@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = {
{ Opt_find_err, NULL }
};
-static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
+static int nfs_idmap_legacy_upcall(struct key *, void *);
static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
size_t);
static void idmap_release_pipe(struct inode *);
@@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
static void
nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
{
- struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
+ struct key *authkey = idmap->idmap_upcall_data->authkey;
kfree(idmap->idmap_upcall_data);
idmap->idmap_upcall_data = NULL;
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
+ key_put(authkey);
}
static void
@@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
}
-static int nfs_idmap_legacy_upcall(struct key_construction *cons,
- const char *op,
- void *aux)
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
{
struct idmap_legacy_upcalldata *data;
+ struct request_key_auth *rka = get_request_key_auth(authkey);
struct rpc_pipe_msg *msg;
struct idmap_msg *im;
struct idmap *idmap = (struct idmap *)aux;
- struct key *key = cons->key;
+ struct key *key = rka->target_key;
int ret = -ENOKEY;
if (!aux)
@@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
msg = &data->pipe_msg;
im = &data->idmap_msg;
data->idmap = idmap;
- data->key_cons = cons;
+ data->authkey = key_get(authkey);
ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
if (ret < 0)
@@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
out2:
kfree(data);
out1:
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
return ret;
}
@@ -651,9 +652,10 @@ out:
static ssize_t
idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
+ struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
- struct key_construction *cons;
+ struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
int ret = -ENOKEY;
@@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (idmap->idmap_upcall_data == NULL)
goto out_noupcall;
- cons = idmap->idmap_upcall_data->key_cons;
+ authkey = idmap->idmap_upcall_data->authkey;
+ rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
ret = -ENOSPC;
@@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
ret = nfs_idmap_read_and_verify_message(&im,
&idmap->idmap_upcall_data->idmap_msg,
- cons->key, cons->authkey);
+ rka->target_key, authkey);
if (ret >= 0) {
- key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+ key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24f06dcc2b08..2e460c33ae48 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
/**
* nfs_find_best_sec - Find a security mechanism supported locally
+ * @clnt: pointer to rpc_clnt
* @server: NFS server struct
* @flavors: List of security tuples returned by SECINFO procedure
*
@@ -288,8 +289,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
/**
* nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry - parent directory
- * @locations - array of NFSv4 server location information
+ * @dentry: parent directory
+ * @locations: array of NFSv4 server location information
*
*/
static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 557a5d636183..4dbb0ee23432 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -730,33 +730,41 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
res->sr_slot = NULL;
}
+static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
+ u32 seqnr)
+{
+ if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
+ slot->seq_nr_highest_sent = seqnr;
+}
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
+ u32 seqnr)
+{
+ slot->seq_nr_highest_sent = seqnr;
+ slot->seq_nr_last_acked = seqnr;
+}
+
static int nfs41_sequence_process(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
struct nfs4_session *session;
struct nfs4_slot *slot = res->sr_slot;
struct nfs_client *clp;
- bool interrupted = false;
int ret = 1;
if (slot == NULL)
goto out_noaction;
/* don't increment the sequence number if the task wasn't sent */
- if (!RPC_WAS_SENT(task))
+ if (!RPC_WAS_SENT(task) || slot->seq_done)
goto out;
session = slot->table->session;
- if (slot->interrupted) {
- if (res->sr_status != -NFS4ERR_DELAY)
- slot->interrupted = 0;
- interrupted = true;
- }
-
trace_nfs4_sequence_done(session, res);
/* Check the SEQUENCE operation status */
switch (res->sr_status) {
case 0:
+ /* Mark this sequence number as having been acked */
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
clp = session->clp;
@@ -771,9 +779,9 @@ static int nfs41_sequence_process(struct rpc_task *task,
* sr_status remains 1 if an RPC level error occurred.
* The server may or may not have processed the sequence
* operation..
- * Mark the slot as having hosted an interrupted RPC call.
*/
- slot->interrupted = 1;
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+ slot->seq_done = 1;
goto out;
case -NFS4ERR_DELAY:
/* The server detected a resend of the RPC call and
@@ -784,6 +792,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
__func__,
slot->slot_nr,
slot->seq_nr);
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto out_retry;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -791,6 +800,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
* The server thinks we tried to replay a request.
* Retry the call after bumping the sequence ID.
*/
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto retry_new_seq;
case -NFS4ERR_BADSLOT:
/*
@@ -801,21 +811,28 @@ static int nfs41_sequence_process(struct rpc_task *task,
goto session_recover;
goto retry_nowait;
case -NFS4ERR_SEQ_MISORDERED:
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
/*
- * Was the last operation on this sequence interrupted?
- * If so, retry after bumping the sequence number.
- */
- if (interrupted)
- goto retry_new_seq;
- /*
- * Could this slot have been previously retired?
- * If so, then the server may be expecting seq_nr = 1!
+ * Were one or more calls using this slot interrupted?
+ * If the server never received the request, then our
+ * transmitted slot sequence number may be too high.
*/
- if (slot->seq_nr != 1) {
- slot->seq_nr = 1;
+ if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
+ slot->seq_nr--;
goto retry_nowait;
}
- goto session_recover;
+ /*
+ * RFC5661:
+ * A retry might be sent while the original request is
+ * still in progress on the replier. The replier SHOULD
+ * deal with the issue by returning NFS4ERR_DELAY as the
+ * reply to SEQUENCE or CB_SEQUENCE operation, but
+ * implementations MAY return NFS4ERR_SEQ_MISORDERED.
+ *
+ * Restart the search after a delay.
+ */
+ slot->seq_nr = slot->seq_nr_highest_sent;
+ goto out_retry;
default:
/* Just update the slot sequence no. */
slot->seq_done = 1;
@@ -906,17 +923,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
.rpc_call_done = nfs41_call_sync_done,
};
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, const struct cred *cred)
-{
- struct rpc_task *task;
-
- task = _nfs41_proc_sequence(client, cred, slot, true);
- if (!IS_ERR(task))
- rpc_put_task_async(task);
-}
-
#else /* !CONFIG_NFS_V4_1 */
static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -937,16 +943,15 @@ int nfs4_sequence_done(struct rpc_task *task,
}
EXPORT_SYMBOL_GPL(nfs4_sequence_done);
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, const struct cred *cred)
+#endif /* !CONFIG_NFS_V4_1 */
+
+static void nfs41_sequence_res_init(struct nfs4_sequence_res *res)
{
- WARN_ON_ONCE(1);
- slot->interrupted = 0;
+ res->sr_timestamp = jiffies;
+ res->sr_status_flags = 0;
+ res->sr_status = 1;
}
-#endif /* !CONFIG_NFS_V4_1 */
-
static
void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
@@ -958,10 +963,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
args->sa_slot = slot;
res->sr_slot = slot;
- res->sr_timestamp = jiffies;
- res->sr_status_flags = 0;
- res->sr_status = 1;
-
}
int nfs4_setup_sequence(struct nfs_client *client,
@@ -982,31 +983,25 @@ int nfs4_setup_sequence(struct nfs_client *client,
task->tk_timeout = 0;
}
- for (;;) {
- spin_lock(&tbl->slot_tbl_lock);
- /* The state manager will wait until the slot table is empty */
- if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
- goto out_sleep;
-
- slot = nfs4_alloc_slot(tbl);
- if (IS_ERR(slot)) {
- /* Try again in 1/4 second */
- if (slot == ERR_PTR(-ENOMEM))
- task->tk_timeout = HZ >> 2;
- goto out_sleep;
- }
- spin_unlock(&tbl->slot_tbl_lock);
+ spin_lock(&tbl->slot_tbl_lock);
+ /* The state manager will wait until the slot table is empty */
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ goto out_sleep;
- if (likely(!slot->interrupted))
- break;
- nfs4_sequence_process_interrupted(client,
- slot, task->tk_msg.rpc_cred);
+ slot = nfs4_alloc_slot(tbl);
+ if (IS_ERR(slot)) {
+ /* Try again in 1/4 second */
+ if (slot == ERR_PTR(-ENOMEM))
+ task->tk_timeout = HZ >> 2;
+ goto out_sleep;
}
+ spin_unlock(&tbl->slot_tbl_lock);
nfs4_sequence_attach_slot(args, res, slot);
trace_nfs4_setup_sequence(session, args);
out_start:
+ nfs41_sequence_res_init(res);
rpc_call_start(task);
return 0;
@@ -1555,6 +1550,10 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
static void nfs_set_open_stateid_locked(struct nfs4_state *state,
const nfs4_stateid *stateid, nfs4_stateid *freeme)
+ __must_hold(&state->owner->so_lock)
+ __must_hold(&state->seqlock)
+ __must_hold(RCU)
+
{
DEFINE_WAIT(wait);
int status = 0;
@@ -5963,7 +5962,7 @@ out:
/**
* nfs4_proc_setclientid_confirm - Confirm client ID
* @clp: state data structure
- * @res: result of a previous SETCLIENTID
+ * @arg: result of a previous SETCLIENTID
* @cred: credential to use for this call
*
* Returns zero, a negative errno, or a negative NFS4ERR status code.
@@ -7527,7 +7526,7 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
return status;
}
-/**
+/*
* If 'use_integrity' is true and the state managment nfs_client
* cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
* and the machine credential as per RFC3530bis and RFC5661 Security
@@ -8937,10 +8936,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
if (status != 0)
goto out;
- /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
- if (task->tk_status < 0 || lgp->res.layoutp->len == 0) {
+ if (task->tk_status < 0) {
status = nfs4_layoutget_handle_exception(task, lgp, &exception);
*timeout = exception.timeout;
+ } else if (lgp->res.layoutp->len == 0) {
+ status = -EAGAIN;
+ *timeout = nfs4_update_delay(&exception.timeout);
} else
lseg = pnfs_layout_process(lgp);
out:
@@ -9219,7 +9220,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
return status;
}
-/**
+/*
* Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
* possible) as per RFC3530bis and RFC5661 Security Considerations sections
*/
@@ -9484,7 +9485,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
* @server: server / transport on which to perform the operation
* @stateid: state ID to release
* @cred: credential
- * @is_recovery: set to true if this call needs to be privileged
+ * @privileged: set to true if this call needs to be privileged
*
* Note: this function is always asynchronous.
*/
@@ -9691,7 +9692,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
- | NFS_CAP_CLONE,
+ | NFS_CAP_CLONE
+ | NFS_CAP_LAYOUTERROR,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index a5489d70a724..bcb532def9e2 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -55,7 +55,7 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
/**
* nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
- * @tbl - controlling slot table
+ * @tbl: controlling slot table
*
*/
void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
@@ -110,6 +110,8 @@ static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
slot->table = tbl;
slot->slot_nr = slotid;
slot->seq_nr = seq_init;
+ slot->seq_nr_highest_sent = seq_init;
+ slot->seq_nr_last_acked = seq_init - 1;
}
return slot;
}
@@ -276,7 +278,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
p = &tbl->slots;
while (*p) {
(*p)->seq_nr = ivalue;
- (*p)->interrupted = 0;
+ (*p)->seq_nr_highest_sent = ivalue;
+ (*p)->seq_nr_last_acked = ivalue - 1;
p = &(*p)->next;
}
tbl->highest_used_slotid = NFS4_NO_SLOT;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3c550f297561..b996ee23f1ba 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -10,7 +10,7 @@
/* maximum number of slots to use */
#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
-#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
#define NFS4_MAX_SLOT_TABLE (1024U)
#define NFS4_NO_SLOT ((u32)-1)
@@ -23,8 +23,9 @@ struct nfs4_slot {
unsigned long generation;
u32 slot_nr;
u32 seq_nr;
- unsigned int interrupted : 1,
- privileged : 1,
+ u32 seq_nr_last_acked;
+ u32 seq_nr_highest_sent;
+ unsigned int privileged : 1,
seq_done : 1;
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 02488b50534a..3de36479ed7a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -563,6 +563,7 @@ static void nfs4_gc_state_owners(struct nfs_server *server)
* nfs4_get_state_owner - Look up a state owner given a credential
* @server: nfs_server to search
* @cred: RPC credential to match
+ * @gfp_flags: allocation mode
*
* Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
*/
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index b4557cf685fb..cd1a5c08da9a 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -524,6 +524,31 @@ TRACE_EVENT(nfs4_setup_sequence,
)
);
+TRACE_EVENT(nfs4_xdr_status,
+ TP_PROTO(
+ u32 op,
+ int error
+ ),
+
+ TP_ARGS(op, error),
+
+ TP_STRUCT__entry(
+ __field(u32, op)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->op = op;
+ __entry->error = -error;
+ ),
+
+ TP_printk(
+ "operation %d: nfs status %d (%s)",
+ __entry->op,
+ __entry->error, show_nfsv4_errors(__entry->error)
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_open_event,
TP_PROTO(
const struct nfs_open_context *ctx,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2fc8f6fa25e4..cfcabc33e24d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -54,6 +54,7 @@
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
+#include "nfs4trace.h"
#include "internal.h"
#include "nfs4idmap.h"
#include "nfs4session.h"
@@ -214,14 +215,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
nfs4_fattr_bitmap_maxsz)
#define encode_read_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
-#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
+#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1)
#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
2 + encode_verifier_maxsz + 5 + \
nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz)
+ decode_verifier_maxsz + 1)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
-#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
+#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 4)
#define decode_write_maxsz (op_decode_hdr_maxsz + \
@@ -283,14 +284,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
#define encode_getacl_maxsz (encode_getattr_maxsz)
#define decode_getacl_maxsz (op_decode_hdr_maxsz + \
- nfs4_fattr_bitmap_maxsz + 1)
+ nfs4_fattr_bitmap_maxsz + 1 + 1)
#define encode_setacl_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
#define decode_setacl_maxsz (decode_setattr_maxsz)
#define encode_fs_locations_maxsz \
(encode_getattr_maxsz)
#define decode_fs_locations_maxsz \
- (0)
+ (1)
#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
@@ -391,12 +392,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
1 /* opaque devaddr4 length */ + \
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
- 1 /* notification bitmap, word 0 */)
+ 1 /* notification bitmap, word 0 */ + \
+ 1 /* possible XDR padding */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
decode_stateid_maxsz + \
- XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1)
#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
2 /* offset */ + \
2 /* length */ + \
@@ -1015,12 +1017,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
__be32 *p;
- struct rpc_auth *auth = req->rq_cred->cr_auth;
/* initialize running count of expected bytes in reply.
* NOTE: the replied tag SHOULD be the same is the one sent,
* but this is not required as a MUST for the server to do so. */
- hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
+ hdr->replen = 3 + hdr->taglen;
WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
encode_string(xdr, hdr->taglen, hdr->tag);
@@ -2340,9 +2341,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
if (args->lg_args) {
encode_layoutget(xdr, args->lg_args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->lg_args->layout.pages,
- 0, args->lg_args->layout.pglen);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen);
}
encode_nops(&hdr);
}
@@ -2386,9 +2387,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
if (args->lg_args) {
encode_layoutget(xdr, args->lg_args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->lg_args->layout.pages,
- 0, args->lg_args->layout.pglen);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen);
}
encode_nops(&hdr);
}
@@ -2498,8 +2499,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_readlink(xdr, args, req, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
- args->pgbase, args->pglen);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, hdr.replen);
encode_nops(&hdr);
}
@@ -2519,11 +2520,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_readdir(xdr, args, req, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
- args->pgbase, args->count);
- dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
- __func__, hdr.replen << 2, args->pages,
- args->pgbase, args->count);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
encode_nops(&hdr);
}
@@ -2543,8 +2541,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_read(xdr, args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->pages, args->pgbase, args->count);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
req->rq_rcv_buf.flags |= XDRBUF_READ;
encode_nops(&hdr);
}
@@ -2590,9 +2588,8 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getattr(xdr, nfs4_acl_bitmap, NULL,
ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- args->acl_pages, 0, args->acl_len);
-
+ rpc_prepare_reply_pages(req, args->acl_pages, 0,
+ args->acl_len, replen);
encode_nops(&hdr);
}
@@ -2813,9 +2810,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
encode_fs_locations(xdr, args->bitmask, &hdr);
}
- /* Set up reply kvec to capture returned fs_locations array. */
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- (struct page **)&args->page, 0, PAGE_SIZE);
+ rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
+ PAGE_SIZE, replen);
encode_nops(&hdr);
}
@@ -3017,10 +3013,8 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
/* set up reply kvec. Subtract notification bitmap max size (2)
* so that notification bitmap is put in xdr_buf tail */
- xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
- args->pdev->pages, args->pdev->pgbase,
- args->pdev->pglen);
-
+ rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
+ args->pdev->pglen, hdr.replen - 2);
encode_nops(&hdr);
}
@@ -3041,9 +3035,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
encode_putfh(xdr, NFS_FH(args->inode), &hdr);
encode_layoutget(xdr, args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->layout.pages, 0, args->layout.pglen);
-
+ rpc_prepare_reply_pages(req, args->layout.pages, 0,
+ args->layout.pglen, hdr.replen);
encode_nops(&hdr);
}
@@ -3144,22 +3137,12 @@ static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
}
#endif /* CONFIG_NFS_V4_1 */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("nfs: %s: prematurely hit end of receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
{
ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string,
NFS4_OPAQUE_LIMIT);
- if (unlikely(ret < 0)) {
- if (ret == -EBADMSG)
- print_overflow_msg(__func__, xdr);
+ if (unlikely(ret < 0))
return -EIO;
- }
*len = ret;
return 0;
}
@@ -3170,22 +3153,19 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
hdr->status = be32_to_cpup(p++);
hdr->taglen = be32_to_cpup(p);
p = xdr_inline_decode(xdr, hdr->taglen + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
hdr->tag = (char *)p;
p += XDR_QUADLEN(hdr->taglen);
hdr->nops = be32_to_cpup(p);
if (unlikely(hdr->nops < 1))
return nfs4_stat_to_errno(hdr->status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
@@ -3201,11 +3181,14 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
opnum = be32_to_cpup(p++);
if (unlikely(opnum != expected))
goto out_bad_operation;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *nfs_retval = 0;
+ return true;
+out_status:
nfserr = be32_to_cpup(p);
- if (nfserr == NFS_OK)
- *nfs_retval = 0;
- else
- *nfs_retval = nfs4_stat_to_errno(nfserr);
+ trace_nfs4_xdr_status(opnum, nfserr);
+ *nfs_retval = nfs4_stat_to_errno(nfserr);
return true;
out_bad_operation:
dprintk("nfs: Server returned operation"
@@ -3214,7 +3197,6 @@ out_bad_operation:
*nfs_retval = -EREMOTEIO;
return false;
out_overflow:
- print_overflow_msg(__func__, xdr);
*nfs_retval = -EIO;
return false;
}
@@ -3235,10 +3217,9 @@ static int decode_ace(struct xdr_stream *xdr, void *ace)
char *str;
p = xdr_inline_decode(xdr, 12);
- if (likely(p))
- return decode_opaque_inline(xdr, &strlen, &str);
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (unlikely(!p))
+ return -EIO;
+ return decode_opaque_inline(xdr, &strlen, &str);
}
static ssize_t
@@ -3249,10 +3230,9 @@ decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz)
ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz);
if (likely(ret >= 0))
return ret;
- if (ret == -EMSGSIZE)
- return sz;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (ret != -EMSGSIZE)
+ return -EIO;
+ return sz;
}
static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -3268,13 +3248,10 @@ static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigne
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*attrlen = be32_to_cpup(p);
*savep = xdr_stream_pos(xdr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -3303,7 +3280,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*type = be32_to_cpup(p);
if (*type < NF4REG || *type > NF4NAMEDATTR) {
dprintk("%s: bad type %d\n", __func__, *type);
@@ -3314,9 +3291,6 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
}
dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
@@ -3330,15 +3304,12 @@ static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*type = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
}
dprintk("%s: expire type=0x%x\n", __func__, *type);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -3352,7 +3323,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
ret = NFS_ATTR_FATTR_CHANGE;
@@ -3360,9 +3331,6 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -3376,16 +3344,13 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
ret = NFS_ATTR_FATTR_SIZE;
}
dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3398,15 +3363,12 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
}
dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3419,15 +3381,12 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
}
dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -3442,7 +3401,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
p = xdr_inline_decode(xdr, 16);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &fsid->major);
xdr_decode_hyper(p, &fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
@@ -3452,9 +3411,6 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3467,15 +3423,12 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
}
dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
@@ -3487,14 +3440,11 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *
if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
*res = -be32_to_cpup(p);
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
@@ -3526,13 +3476,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
if (fh != NULL) {
memcpy(fh->data, p, len);
fh->size = len;
@@ -3540,9 +3490,6 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3555,15 +3502,12 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
}
dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3577,16 +3521,13 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3600,16 +3541,13 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3623,15 +3561,12 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
}
dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3645,15 +3580,12 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
}
dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3667,15 +3599,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
}
dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -3686,7 +3615,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
n = be32_to_cpup(p);
if (n == 0)
goto root_path;
@@ -3718,9 +3647,6 @@ out_eio:
dprintk(" status %d", status);
status = -EIO;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -3745,7 +3671,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
goto out;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ goto out_eio;
n = be32_to_cpup(p);
if (n <= 0)
goto out_eio;
@@ -3758,7 +3684,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
loc = &res->locations[res->nlocations];
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ goto out_eio;
m = be32_to_cpup(p);
dprintk("%s: servers:\n", __func__);
@@ -3796,8 +3722,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
out_eio:
status = -EIO;
goto out;
@@ -3814,15 +3738,12 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
}
dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -3836,15 +3757,12 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*maxlink = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
}
dprintk("%s: maxlink=%u\n", __func__, *maxlink);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -3858,15 +3776,12 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*maxname = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
}
dprintk("%s: maxname=%u\n", __func__, *maxname);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3881,7 +3796,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
uint64_t maxread;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &maxread);
if (maxread > 0x7FFFFFFF)
maxread = 0x7FFFFFFF;
@@ -3890,9 +3805,6 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
}
dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3907,7 +3819,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
uint64_t maxwrite;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &maxwrite);
if (maxwrite > 0x7FFFFFFF)
maxwrite = 0x7FFFFFFF;
@@ -3916,9 +3828,6 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
}
dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3933,7 +3842,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
tmp = be32_to_cpup(p);
*mode = tmp & ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
@@ -3941,9 +3850,6 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
}
dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3957,16 +3863,13 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*nlink = be32_to_cpup(p);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
ret = NFS_ATTR_FATTR_NLINK;
}
dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static ssize_t decode_nfs4_string(struct xdr_stream *xdr,
@@ -4011,10 +3914,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
return NFS_ATTR_FATTR_OWNER;
}
out:
- if (len != -EBADMSG)
- return 0;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4046,10 +3948,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
return NFS_ATTR_FATTR_GROUP;
}
out:
- if (len != -EBADMSG)
- return 0;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
}
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -4066,7 +3967,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
major = be32_to_cpup(p++);
minor = be32_to_cpup(p);
tmp = MKDEV(major, minor);
@@ -4077,9 +3978,6 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
}
dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4093,15 +3991,12 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
}
dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4115,15 +4010,12 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
}
dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4137,15 +4029,12 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
}
dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -4159,7 +4048,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
ret = NFS_ATTR_FATTR_SPACE_USED;
@@ -4167,9 +4056,6 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static __be32 *
@@ -4189,12 +4075,9 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
p = xdr_inline_decode(xdr, nfstime4_maxsz << 2);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_nfstime4(p, time);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4265,19 +4148,19 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
lfs = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
pi = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
if (len < NFS4_MAXLABELLEN) {
if (label) {
memcpy(label->label, p, len);
@@ -4295,10 +4178,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
(char *)label->label, label->len, label->pi, label->lfs);
return status;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4342,14 +4221,11 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
p = xdr_inline_decode(xdr, 20);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
cinfo->atomic = be32_to_cpup(p++);
p = xdr_decode_hyper(p, &cinfo->before);
xdr_decode_hyper(p, &cinfo->after);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
@@ -4363,24 +4239,19 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
return status;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
supp = be32_to_cpup(p++);
acc = be32_to_cpup(p);
*supported = supp;
*access = acc;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
{
ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len);
- if (unlikely(ret < 0)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(ret < 0))
return -EIO;
- }
return 0;
}
@@ -4460,13 +4331,11 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
bmlen = be32_to_cpup(p);
p = xdr_inline_decode(xdr, bmlen << 2);
if (likely(p))
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -4574,13 +4443,10 @@ static int decode_threshold_hint(struct xdr_stream *xdr,
if (likely(bitmap[0] & hint_bit)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_first_threshold_item4(struct xdr_stream *xdr,
@@ -4593,10 +4459,8 @@ static int decode_first_threshold_item4(struct xdr_stream *xdr,
/* layout type */
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
res->l_type = be32_to_cpup(p);
/* thi_hintset bitmap */
@@ -4654,7 +4518,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
return -EREMOTEIO;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
num = be32_to_cpup(p);
if (num == 0)
return 0;
@@ -4667,9 +4531,6 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD;
}
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4857,7 +4718,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
fsinfo->nlayouttypes = be32_to_cpup(p);
/* pNFS is not supported by the underlying file system */
@@ -4867,7 +4728,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
/* Decode and set first layout type, move xdr->p past unused types */
p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
/* If we get too many, then just cap it at the max */
if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
@@ -4879,9 +4740,6 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
for(i = 0; i < fsinfo->nlayouttypes; ++i)
fsinfo->layouttype[i] = be32_to_cpup(p++);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -4915,10 +4773,8 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
*res = 0;
if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
*res = be32_to_cpup(p);
bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
}
@@ -4937,10 +4793,8 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
*res = 0;
if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
*res = be32_to_cpup(p);
bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
}
@@ -5016,19 +4870,16 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
fh->size = len;
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
memcpy(fh->data, p, len);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5052,7 +4903,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
p = xdr_decode_hyper(p, &length);
type = be32_to_cpup(p++); /* 4 byte read */
@@ -5069,11 +4920,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
p = xdr_inline_decode(xdr, namelen); /* variable size field */
- if (likely(p))
- return -NFS4ERR_DENIED;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (likely(!p))
+ return -EIO;
+ return -NFS4ERR_DENIED;
}
static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
@@ -5142,7 +4991,7 @@ static int decode_space_limit(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
limit_type = be32_to_cpup(p++);
switch (limit_type) {
case NFS4_LIMIT_SIZE:
@@ -5156,9 +5005,6 @@ static int decode_space_limit(struct xdr_stream *xdr,
maxsize >>= PAGE_SHIFT;
*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_rw_delegation(struct xdr_stream *xdr,
@@ -5173,7 +5019,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->do_recall = be32_to_cpup(p);
switch (delegation_type) {
@@ -5186,9 +5032,6 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
return -EIO;
}
return decode_ace(xdr, NULL);
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5198,7 +5041,7 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
why_no_delegation = be32_to_cpup(p);
switch (why_no_delegation) {
case WND4_CONTENTION:
@@ -5207,9 +5050,6 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
/* Ignore for now */
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5219,7 +5059,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
delegation_type = be32_to_cpup(p);
res->delegation_type = 0;
switch (delegation_type) {
@@ -5232,9 +5072,6 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
return decode_no_delegation(xdr, res);
}
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5256,7 +5093,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->rflags = be32_to_cpup(p++);
bmlen = be32_to_cpup(p);
if (bmlen > 10)
@@ -5264,7 +5101,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, bmlen << 2);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
for (i = 0; i < savewords; ++i)
res->attrset[i] = be32_to_cpup(p++);
@@ -5275,9 +5112,6 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
xdr_error:
dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
@@ -5326,7 +5160,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
return status;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
eof = be32_to_cpup(p++);
count = be32_to_cpup(p);
recvd = xdr_read_pages(xdr, count);
@@ -5339,9 +5173,6 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
res->eof = eof;
res->count = count;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -5374,7 +5205,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
/* Convert length of symlink */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len >= rcvbuf->page_len || len <= 0) {
dprintk("nfs: server returned giant symlink!\n");
@@ -5395,9 +5226,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
*/
xdr_terminate_string(rcvbuf, len);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5500,7 +5328,6 @@ static int decode_setattr(struct xdr_stream *xdr)
return status;
if (decode_bitmap4(xdr, NULL, 0) >= 0)
return 0;
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -5512,7 +5339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
opnum = be32_to_cpup(p++);
if (opnum != OP_SETCLIENTID) {
dprintk("nfs: decode_setclientid: Server returned operation"
@@ -5523,7 +5350,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
if (nfserr == NFS_OK) {
p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->clientid);
memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
} else if (nfserr == NFSERR_CLID_INUSE) {
@@ -5532,28 +5359,25 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
/* skip netid string */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
/* skip uaddr string */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
return -NFSERR_CLID_INUSE;
} else
return nfs4_stat_to_errno(nfserr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -5572,13 +5396,10 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->count = be32_to_cpup(p++);
res->verf->committed = be32_to_cpup(p++);
return decode_write_verifier(xdr, &res->verf->verifier);
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_delegreturn(struct xdr_stream *xdr)
@@ -5594,30 +5415,24 @@ static int decode_secinfo_gss(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
oid_len = be32_to_cpup(p);
if (oid_len > GSS_OID_MAX_LEN)
- goto out_err;
+ return -EINVAL;
p = xdr_inline_decode(xdr, oid_len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
memcpy(flavor->flavor_info.oid.data, p, oid_len);
flavor->flavor_info.oid.len = oid_len;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
flavor->flavor_info.qop = be32_to_cpup(p++);
flavor->flavor_info.service = be32_to_cpup(p);
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
-out_err:
- return -EINVAL;
}
static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5629,7 +5444,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->flavors->num_flavors = 0;
num_flavors = be32_to_cpup(p);
@@ -5641,7 +5456,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
sec_flavor->flavor = be32_to_cpup(p);
if (sec_flavor->flavor == RPC_AUTH_GSS) {
@@ -5655,9 +5470,6 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
status = 0;
out:
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5711,11 +5523,11 @@ static int decode_exchange_id(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &res->clientid);
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->seqid = be32_to_cpup(p++);
res->flags = be32_to_cpup(p++);
@@ -5739,7 +5551,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* server_owner4.so_minor_id */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->server_owner->minor_id);
/* server_owner4.so_major_id */
@@ -5759,7 +5571,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* Implementation Id */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
impl_id_count = be32_to_cpup(p++);
if (impl_id_count) {
@@ -5778,16 +5590,13 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* nii_date */
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
res->impl_id->date.nseconds = be32_to_cpup(p);
/* if there's more than one entry, ignore the rest */
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -5798,7 +5607,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 28);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
val = be32_to_cpup(p++); /* headerpadsz */
if (val)
return -EINVAL; /* no support for header padding yet */
@@ -5816,12 +5625,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
if (nr_attrs == 1) {
p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
@@ -5844,7 +5650,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
/* dir flags, rdma mode bool */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->dir = be32_to_cpup(p++);
if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
@@ -5855,9 +5661,6 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
res->use_conn_in_rdma_mode = true;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_create_session(struct xdr_stream *xdr,
@@ -5875,7 +5678,7 @@ static int decode_create_session(struct xdr_stream *xdr,
/* seqid, flags */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->seqid = be32_to_cpup(p++);
res->flags = be32_to_cpup(p);
@@ -5884,9 +5687,6 @@ static int decode_create_session(struct xdr_stream *xdr,
if (!status)
status = decode_chan_attrs(xdr, &res->bc_attrs);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -5967,7 +5767,6 @@ out_err:
res->sr_status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out_err;
#else /* CONFIG_NFS_V4_1 */
@@ -5995,7 +5794,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
if (status == -ETOOSMALL) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
pdev->mincount = be32_to_cpup(p);
dprintk("%s: Min count too small. mincnt = %u\n",
__func__, pdev->mincount);
@@ -6005,7 +5804,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
type = be32_to_cpup(p++);
if (type != pdev->layout_type) {
dprintk("%s: layout mismatch req: %u pdev: %u\n",
@@ -6019,19 +5818,19 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
*/
pdev->mincount = be32_to_cpup(p);
if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
- goto out_overflow;
+ return -EIO;
/* Parse notification bitmap, verifying that it is zero. */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len) {
uint32_t i;
p = xdr_inline_decode(xdr, 4 * len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->notification = be32_to_cpup(p++);
for (i = 1; i < len; i++) {
@@ -6043,9 +5842,6 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
}
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
@@ -6115,7 +5911,6 @@ out:
res->status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out;
}
@@ -6131,16 +5926,13 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->lrs_present = be32_to_cpup(p);
if (res->lrs_present)
status = decode_layout_stateid(xdr, &res->stateid);
else
nfs4_stateid_copy(&res->stateid, &invalid_stateid);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutcommit(struct xdr_stream *xdr,
@@ -6158,19 +5950,16 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
sizechanged = be32_to_cpup(p);
if (sizechanged) {
/* throw away new size */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_test_stateid(struct xdr_stream *xdr,
@@ -6186,21 +5975,17 @@ static int decode_test_stateid(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
num_res = be32_to_cpup(p++);
if (num_res != 1)
- goto out;
+ return -EIO;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->status = be32_to_cpup(p++);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
-out:
- return -EIO;
}
static int decode_free_stateid(struct xdr_stream *xdr,
@@ -7570,11 +7355,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
uint64_t new_cookie;
__be32 *p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
if (*p == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
if (*p == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -7583,13 +7368,13 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
p = xdr_decode_hyper(p, &new_cookie);
entry->len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, entry->len);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
entry->name = (const char *) p;
/*
@@ -7601,14 +7386,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->fattr->valid = 0;
if (decode_attr_bitmap(xdr, bitmap) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (decode_attr_length(xdr, &len, &savep) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
NULL, entry->label, entry->server) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
@@ -7622,10 +7407,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->cookie = new_cookie;
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
}
/*
@@ -7791,6 +7572,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(COPY, enc_copy, dec_copy),
PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel),
PROC(LOOKUPP, enc_lookupp, dec_lookupp),
+ PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
index b60d5fbd7727..a90b363500c2 100644
--- a/fs/nfs/nfstrace.c
+++ b/fs/nfs/nfstrace.c
@@ -11,3 +11,4 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index bd60f8d1e181..a0d6910aa03a 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -969,6 +969,91 @@ TRACE_EVENT(nfs_commit_done,
)
);
+TRACE_DEFINE_ENUM(NFS_OK);
+TRACE_DEFINE_ENUM(NFSERR_PERM);
+TRACE_DEFINE_ENUM(NFSERR_NOENT);
+TRACE_DEFINE_ENUM(NFSERR_IO);
+TRACE_DEFINE_ENUM(NFSERR_NXIO);
+TRACE_DEFINE_ENUM(NFSERR_ACCES);
+TRACE_DEFINE_ENUM(NFSERR_EXIST);
+TRACE_DEFINE_ENUM(NFSERR_XDEV);
+TRACE_DEFINE_ENUM(NFSERR_NODEV);
+TRACE_DEFINE_ENUM(NFSERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFSERR_ISDIR);
+TRACE_DEFINE_ENUM(NFSERR_INVAL);
+TRACE_DEFINE_ENUM(NFSERR_FBIG);
+TRACE_DEFINE_ENUM(NFSERR_NOSPC);
+TRACE_DEFINE_ENUM(NFSERR_ROFS);
+TRACE_DEFINE_ENUM(NFSERR_MLINK);
+TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFSERR_DQUOT);
+TRACE_DEFINE_ENUM(NFSERR_STALE);
+TRACE_DEFINE_ENUM(NFSERR_REMOTE);
+TRACE_DEFINE_ENUM(NFSERR_WFLUSH);
+TRACE_DEFINE_ENUM(NFSERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC);
+TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFSERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFSERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFSERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
+
+#define nfs_show_status(x) \
+ __print_symbolic(x, \
+ { NFS_OK, "OK" }, \
+ { NFSERR_PERM, "PERM" }, \
+ { NFSERR_NOENT, "NOENT" }, \
+ { NFSERR_IO, "IO" }, \
+ { NFSERR_NXIO, "NXIO" }, \
+ { NFSERR_ACCES, "ACCES" }, \
+ { NFSERR_EXIST, "EXIST" }, \
+ { NFSERR_XDEV, "XDEV" }, \
+ { NFSERR_NODEV, "NODEV" }, \
+ { NFSERR_NOTDIR, "NOTDIR" }, \
+ { NFSERR_ISDIR, "ISDIR" }, \
+ { NFSERR_INVAL, "INVAL" }, \
+ { NFSERR_FBIG, "FBIG" }, \
+ { NFSERR_NOSPC, "NOSPC" }, \
+ { NFSERR_ROFS, "ROFS" }, \
+ { NFSERR_MLINK, "MLINK" }, \
+ { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \
+ { NFSERR_NOTEMPTY, "NOTEMPTY" }, \
+ { NFSERR_DQUOT, "DQUOT" }, \
+ { NFSERR_STALE, "STALE" }, \
+ { NFSERR_REMOTE, "REMOTE" }, \
+ { NFSERR_WFLUSH, "WFLUSH" }, \
+ { NFSERR_BADHANDLE, "BADHANDLE" }, \
+ { NFSERR_NOT_SYNC, "NOTSYNC" }, \
+ { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \
+ { NFSERR_NOTSUPP, "NOTSUPP" }, \
+ { NFSERR_TOOSMALL, "TOOSMALL" }, \
+ { NFSERR_SERVERFAULT, "REMOTEIO" }, \
+ { NFSERR_BADTYPE, "BADTYPE" }, \
+ { NFSERR_JUKEBOX, "JUKEBOX" })
+
+TRACE_EVENT(nfs_xdr_status,
+ TP_PROTO(
+ int error
+ ),
+
+ TP_ARGS(error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s)",
+ __entry->error, nfs_show_status(__entry->error)
+ )
+);
+
#endif /* _TRACE_NFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e54d899c1848..e9f39fa5964b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -350,7 +350,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
/**
* nfs_unlock_request - Unlock request and wake up sleepers.
- * @req:
+ * @req: pointer to request
*/
void nfs_unlock_request(struct nfs_page *req)
{
@@ -368,7 +368,7 @@ void nfs_unlock_request(struct nfs_page *req)
/**
* nfs_unlock_and_release_request - Unlock request and release the nfs_page
- * @req:
+ * @req: pointer to request
*/
void nfs_unlock_and_release_request(struct nfs_page *req)
{
@@ -531,7 +531,6 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
* nfs_pgio_rpcsetup - Set up arguments for a pageio call
* @hdr: The pageio hdr
* @count: Number of bytes to read
- * @offset: Initial offset
* @how: How to commit data (writes only)
* @cinfo: Commit information for the call (writes only)
*/
@@ -634,7 +633,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
/**
* nfs_pgio_error - Clean up from a pageio error
- * @desc: IO descriptor
* @hdr: pageio header
*/
static void nfs_pgio_error(struct nfs_pgio_header *hdr)
@@ -768,8 +766,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
pageused = 0;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
+ nfs_list_move_request(req, &hdr->pages);
if (!last_page || last_page != req->wb_page) {
pageused++;
@@ -893,6 +890,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
* nfs_can_coalesce_requests - test two requests for compatibility
* @prev: pointer to nfs_page
* @req: pointer to nfs_page
+ * @pgio: pointer to nfs_pagio_descriptor
*
* The nfs_page structures 'prev' and 'req' are compared to ensure that the
* page data area they describe is contiguous, and that their RPC
@@ -961,8 +959,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
}
if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &mirror->pg_list);
+ nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
return 1;
}
@@ -988,6 +985,16 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
}
}
+static void
+nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ LIST_HEAD(head);
+
+ nfs_list_move_request(req, &head);
+ desc->pg_completion_ops->error_cleanup(&head, desc->pg_error);
+}
+
/**
* nfs_pageio_add_request - Attempt to coalesce a request into a page list.
* @desc: destination io descriptor
@@ -1025,10 +1032,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
nfs_page_group_unlock(req);
desc->pg_moreio = 1;
nfs_pageio_doio(desc);
- if (desc->pg_error < 0)
- return 0;
- if (mirror->pg_recoalesce)
- return 0;
+ if (desc->pg_error < 0 || mirror->pg_recoalesce)
+ goto out_cleanup_subreq;
/* retry add_request for this subreq */
nfs_page_group_lock(req);
continue;
@@ -1061,6 +1066,10 @@ err_ptr:
desc->pg_error = PTR_ERR(subreq);
nfs_page_group_unlock(req);
return 0;
+out_cleanup_subreq:
+ if (req != subreq)
+ nfs_pageio_cleanup_request(desc, subreq);
+ return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1079,7 +1088,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
struct nfs_page *req;
req = list_first_entry(&head, struct nfs_page, wb_list);
- nfs_list_remove_request(req);
if (__nfs_pageio_add_request(desc, req))
continue;
if (desc->pg_error < 0) {
@@ -1120,7 +1128,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list,
+ desc->pg_error);
}
}
@@ -1168,11 +1177,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- goto out_failed;
+ goto out_cleanup_subreq;
}
return 1;
+out_cleanup_subreq:
+ if (req != dupreq)
+ nfs_pageio_cleanup_request(desc, dupreq);
out_failed:
nfs_pageio_error_cleanup(desc);
return 0;
@@ -1194,7 +1206,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
desc->pg_mirror_idx = mirror_idx;
for (;;) {
nfs_pageio_doio(desc);
- if (!mirror->pg_recoalesce)
+ if (desc->pg_error < 0 || !mirror->pg_recoalesce)
break;
if (!nfs_do_recoalesce(desc))
break;
@@ -1222,9 +1234,8 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
- nfs_list_remove_request(req);
if (!nfs_pageio_add_request(desc, req))
- nfs_list_add_request(req, &failed);
+ nfs_list_move_request(req, &failed);
}
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 53726da5c010..8247bd1634cb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -758,22 +758,35 @@ static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
struct nfs_server *server,
struct list_head *layout_list)
+ __must_hold(&clp->cl_lock)
+ __must_hold(RCU)
{
struct pnfs_layout_hdr *lo, *next;
struct inode *inode;
list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
- if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
+ !list_empty(&lo->plh_bulk_destroy))
continue;
+ /* If the sb is being destroyed, just bail */
+ if (!nfs_sb_active(server->super))
+ break;
inode = igrab(lo->plh_inode);
- if (inode == NULL)
- continue;
- list_del_init(&lo->plh_layouts);
- if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
- continue;
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
+ if (inode != NULL) {
+ list_del_init(&lo->plh_layouts);
+ if (pnfs_layout_add_bulk_destroy_list(inode,
+ layout_list))
+ continue;
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ } else {
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+ }
+ nfs_sb_deactive(server->super);
spin_lock(&clp->cl_lock);
rcu_read_lock();
return -EAGAIN;
@@ -811,7 +824,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
/* Free all lsegs that are attached to commit buckets */
nfs_commit_inode(inode, 0);
pnfs_put_layout_hdr(lo);
- iput(inode);
+ nfs_iput_and_deactive(inode);
}
return ret;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5e80a07b7bea..c0420b979d88 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -104,6 +104,7 @@ enum {
NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
+ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
};
enum layoutdriver_policy_flags {
@@ -349,6 +350,7 @@ void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nf
void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
const struct nfs4_deviceid *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node);
void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 7fb59487ee90..537b80d693f1 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -284,10 +284,22 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
void
+nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node)
+{
+ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available);
+
+void
nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
{
node->timestamp_unavailable = jiffies;
+ smp_mb__before_atomic();
set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
}
EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
@@ -302,6 +314,7 @@ nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
if (time_in_range(node->timestamp_unavailable, start, end))
return true;
clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
}
return false;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f9f19784db82..1d95a60b2586 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
}
static void
-nfs_async_read_error(struct list_head *head)
+nfs_async_read_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0570391eaa16..23790c7b2289 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1919,7 +1919,7 @@ static int nfs_parse_devname(const char *dev_name,
/* kill possible hostname list: not supported */
comma = strchr(dev_name, ',');
if (comma != NULL && comma < end)
- *comma = 0;
+ len = comma - dev_name;
}
if (len > maxnamlen)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 79b97b3c4427..52d533967485 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -39,6 +39,7 @@ nfs_free_unlinkdata(struct nfs_unlinkdata *data)
/**
* nfs_async_unlink_done - Sillydelete post-processing
* @task: rpc_task of the sillydelete
+ * @calldata: pointer to nfs_unlinkdata
*
* Do the directory attribute update.
*/
@@ -54,7 +55,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
/**
* nfs_async_unlink_release - Release the sillydelete data.
- * @task: rpc_task of the sillydelete
+ * @calldata: struct nfs_unlinkdata to release
*
* We need to call nfs_put_unlinkdata as a 'tk_release' task since the
* rpc_task would be freed too.
@@ -159,8 +160,8 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf
/**
* nfs_async_unlink - asynchronous unlinking of a file
- * @dir: parent directory of dentry
- * @dentry: dentry to unlink
+ * @dentry: parent directory of dentry
+ * @name: name of dentry to unlink
*/
static int
nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
@@ -324,6 +325,7 @@ static const struct rpc_call_ops nfs_rename_ops = {
* @new_dir: target directory for the rename
* @old_dentry: original dentry to be renamed
* @new_dentry: dentry to which the old_dentry should be renamed
+ * @complete: Function to run on successful completion
*
* It's expected that valid references to the dentries and inodes are held
*/
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d09c9f878141..f3ebabaa291d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -26,6 +26,7 @@
#include <linux/iversion.h>
#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
#include "delegation.h"
#include "internal.h"
@@ -712,11 +713,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct nfs_pageio_descriptor pgio;
- struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS);
+ struct nfs_io_completion *ioc;
+ unsigned int pflags = memalloc_nofs_save();
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
+ ioc = nfs_io_completion_alloc(GFP_NOFS);
if (ioc)
nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
@@ -727,6 +730,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
nfs_pageio_complete(&pgio);
nfs_io_completion_put(ioc);
+ memalloc_nofs_restore(pflags);
+
if (err < 0)
goto out_err;
err = pgio.pg_error;
@@ -865,7 +870,6 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
- * @dst: commit list head
* @cinfo: holds list lock and accounting info
*
* This sets the PG_CLEAN bit, updates the cinfo count of
@@ -1412,20 +1416,27 @@ static void nfs_redirty_request(struct nfs_page *req)
nfs_release_request(req);
}
-static void nfs_async_write_error(struct list_head *head)
+static void nfs_async_write_error(struct list_head *head, int error)
{
struct nfs_page *req;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
+ if (nfs_error_is_fatal(error)) {
+ nfs_context_set_write_error(req->wb_context, error);
+ if (nfs_error_is_fatal_on_server(error)) {
+ nfs_write_error_remove_page(req);
+ continue;
+ }
+ }
nfs_redirty_request(req);
}
}
static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
{
- nfs_async_write_error(&hdr->pages);
+ nfs_async_write_error(&hdr->pages, 0);
filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset,
hdr->args.offset + hdr->args.count - 1);
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9eb8086ea841..8f933e84cec1 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
&resp->common, nfs3svc_encode_entry);
memcpy(resp->verf, argp->verf, 8);
resp->count = resp->buffer - argp->buffer;
- if (resp->offset)
- xdr_encode_hyper(resp->offset, argp->cookie);
+ if (resp->offset) {
+ loff_t offset = argp->cookie;
+
+ if (unlikely(resp->offset1)) {
+ /* we ended up with offset on a page boundary */
+ *resp->offset = htonl(offset >> 32);
+ *resp->offset1 = htonl(offset & 0xffffffff);
+ resp->offset1 = NULL;
+ } else {
+ xdr_encode_hyper(resp->offset, offset);
+ }
+ resp->offset = NULL;
+ }
RETURN_STATUS(nfserr);
}
@@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
} else {
xdr_encode_hyper(resp->offset, offset);
}
+ resp->offset = NULL;
}
RETURN_STATUS(nfserr);
@@ -576,7 +588,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
resp->f_wtmax = max_blocksize;
resp->f_wtpref = max_blocksize;
resp->f_wtmult = PAGE_SIZE;
- resp->f_dtpref = PAGE_SIZE;
+ resp->f_dtpref = max_blocksize;
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9b973f4f7d01..93fea246f676 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -573,6 +573,8 @@ int
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readdirargs *args = rqstp->rq_argp;
+ u32 max_blocksize = svc_max_payload(rqstp);
+
p = decode_fh(p, &args->fh);
if (!p)
return 0;
@@ -580,7 +582,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
args->verf = p; p += 2;
args->dircount = ~0;
args->count = ntohl(*p++);
- args->count = min_t(u32, args->count, PAGE_SIZE);
+ args->count = min_t(u32, args->count, max_blocksize);
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
@@ -921,6 +923,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
} else {
xdr_encode_hyper(cd->offset, offset64);
}
+ cd->offset = NULL;
}
/*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c74e4538d0eb..d219159b98af 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -60,16 +60,6 @@ struct nfs4_cb_compound_hdr {
int status;
};
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
static __be32 *xdr_encode_empty_array(__be32 *p)
{
*p++ = xdr_zero;
@@ -240,7 +230,6 @@ static int decode_cb_op_status(struct xdr_stream *xdr,
*status = nfs_cb_stat_to_errno(be32_to_cpup(p));
return 0;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
out_unexpected:
dprintk("NFSD: Callback server returned operation %d but "
@@ -309,7 +298,6 @@ static int decode_cb_compound4res(struct xdr_stream *xdr,
hdr->nops = be32_to_cpup(p);
return 0;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -437,7 +425,6 @@ out:
cb->cb_seq_status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out;
}
@@ -913,9 +900,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
return PTR_ERR(client);
}
cred = get_backchannel_cred(clp, client, ses);
- if (IS_ERR(cred)) {
+ if (!cred) {
rpc_shutdown_client(client);
- return PTR_ERR(cred);
+ return -ENOMEM;
}
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fb3c9844c82a..6a45fb00c5fc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1544,16 +1544,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
{
u32 slotsize = slot_bytes(ca);
u32 num = ca->maxreqs;
- int avail;
+ unsigned long avail, total_avail;
spin_lock(&nfsd_drc_lock);
- avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
- nfsd_drc_max_mem - nfsd_drc_mem_used);
+ total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+ avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
/*
* Never use more than a third of the remaining memory,
* unless it's the only way to give this client a slot:
*/
- avail = clamp_t(int, avail, slotsize, avail/3);
+ avail = clamp_t(int, avail, slotsize, total_avail/3);
num = min_t(int, num, avail / slotsize);
nfsd_drc_mem_used += num * slotsize;
spin_unlock(&nfsd_drc_lock);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 72a7681f4046..f2feb2d11bae 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
case 'Y':
case 'y':
case '1':
- if (nn->nfsd_serv)
+ if (!nn->nfsd_serv)
return -EBUSY;
nfsd4_end_grace(nn);
break;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f2129a5d9f23..4391fd3abd8f 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -189,7 +189,7 @@ retry:
*/
if (!err)
return 0;
- else if (err != -EEXIST)
+ else if (err != -EBUSY)
goto failed_unlock;
err = invalidate_inode_pages2_range(btnc, newkey, newkey);
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 41355ce74ac0..735bfb2e9190 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -2,6 +2,7 @@ config FANOTIFY
bool "Filesystem wide access notification"
select FSNOTIFY
select ANON_INODES
+ select EXPORTFS
default n
---help---
Say Y here to enable fanotify support. fanotify is a file access
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 3723f3d18d20..6b9c27548997 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -13,22 +13,40 @@
#include <linux/wait.h>
#include <linux/audit.h>
#include <linux/sched/mm.h>
+#include <linux/statfs.h>
#include "fanotify.h"
static bool should_merge(struct fsnotify_event *old_fsn,
struct fsnotify_event *new_fsn)
{
- struct fanotify_event_info *old, *new;
+ struct fanotify_event *old, *new;
pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
old = FANOTIFY_E(old_fsn);
new = FANOTIFY_E(new_fsn);
- if (old_fsn->inode == new_fsn->inode && old->pid == new->pid &&
- old->path.mnt == new->path.mnt &&
- old->path.dentry == new->path.dentry)
- return true;
+ if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+ old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+ return false;
+
+ if (fanotify_event_has_path(old)) {
+ return old->path.mnt == new->path.mnt &&
+ old->path.dentry == new->path.dentry;
+ } else if (fanotify_event_has_fid(old)) {
+ /*
+ * We want to merge many dirent events in the same dir (i.e.
+ * creates/unlinks/renames), but we do not want to merge dirent
+ * events referring to subdirs with dirent events referring to
+ * non subdirs, otherwise, user won't be able to tell from a
+ * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+
+ * unlink pair or rmdir+create pair of events.
+ */
+ return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
+ fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+ }
+
+ /* Do not merge events if we failed to encode fid */
return false;
}
@@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn,
static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
{
struct fsnotify_event *test_event;
+ struct fanotify_event *new;
pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+ new = FANOTIFY_E(event);
/*
* Don't merge a permission event with any other event so that we know
* the event structure we have created in fanotify_handle_event() is the
* one we should check for permission response.
*/
- if (fanotify_is_perm_event(event->mask))
+ if (fanotify_is_perm_event(new->mask))
return 0;
list_for_each_entry_reverse(test_event, list, list) {
if (should_merge(test_event, event)) {
- test_event->mask |= event->mask;
+ FANOTIFY_E(test_event)->mask |= new->mask;
return 1;
}
}
@@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
return 0;
}
+/*
+ * Wait for response to permission event. The function also takes care of
+ * freeing the permission event (or offloads that in case the wait is canceled
+ * by a signal). The function returns 0 in case access got allowed by userspace,
+ * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
+ * the wait got interrupted by a signal.
+ */
static int fanotify_get_response(struct fsnotify_group *group,
- struct fanotify_perm_event_info *event,
+ struct fanotify_perm_event *event,
struct fsnotify_iter_info *iter_info)
{
int ret;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- wait_event(group->fanotify_data.access_waitq, event->response);
+ ret = wait_event_killable(group->fanotify_data.access_waitq,
+ event->state == FAN_EVENT_ANSWERED);
+ /* Signal pending? */
+ if (ret < 0) {
+ spin_lock(&group->notification_lock);
+ /* Event reported to userspace and no answer yet? */
+ if (event->state == FAN_EVENT_REPORTED) {
+ /* Event will get freed once userspace answers to it */
+ event->state = FAN_EVENT_CANCELED;
+ spin_unlock(&group->notification_lock);
+ return ret;
+ }
+ /* Event not yet reported? Just remove it. */
+ if (event->state == FAN_EVENT_INIT)
+ fsnotify_remove_queued_event(group, &event->fae.fse);
+ /*
+ * Event may be also answered in case signal delivery raced
+ * with wakeup. In that case we have nothing to do besides
+ * freeing the event and reporting error.
+ */
+ spin_unlock(&group->notification_lock);
+ goto out;
+ }
/* userspace responded, convert to something usable */
switch (event->response & ~FAN_AUDIT) {
@@ -81,11 +130,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
if (event->response & FAN_AUDIT)
audit_fanotify(event->response & ~FAN_AUDIT);
- event->response = 0;
-
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
-
+out:
+ fsnotify_destroy_event(group, &event->fae.fse);
+
return ret;
}
@@ -95,11 +144,13 @@ static int fanotify_get_response(struct fsnotify_group *group,
* been included within the event mask, but have not been explicitly
* requested by the user, will not be present in the returned mask.
*/
-static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
- u32 event_mask, const void *data,
- int data_type)
+static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+ struct fsnotify_iter_info *iter_info,
+ u32 event_mask, const void *data,
+ int data_type)
{
__u32 marks_mask = 0, marks_ignored_mask = 0;
+ __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
const struct path *path = data;
struct fsnotify_mark *mark;
int type;
@@ -107,14 +158,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
__func__, iter_info->report_mask, event_mask, data, data_type);
- /* If we don't have enough info to send an event to userspace say no */
- if (data_type != FSNOTIFY_EVENT_PATH)
- return 0;
-
- /* Sorry, fanotify only gives a damn about files and dirs */
- if (!d_is_reg(path->dentry) &&
- !d_can_lookup(path->dentry))
- return 0;
+ if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do we have path to open a file descriptor? */
+ if (data_type != FSNOTIFY_EVENT_PATH)
+ return 0;
+ /* Path type events are only relevant for files and dirs */
+ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
+ return 0;
+ }
fsnotify_foreach_obj_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
@@ -133,20 +184,106 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
marks_ignored_mask |= mark->ignored_mask;
}
- if (d_is_dir(path->dentry) &&
+ test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+
+ /*
+ * dirent modification events (create/delete/move) do not carry the
+ * child entry name/inode information. Instead, we report FAN_ONDIR
+ * for mkdir/rmdir so user can differentiate them from creat/unlink.
+ *
+ * For backward compatibility and consistency, do not report FAN_ONDIR
+ * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
+ * to user in FAN_REPORT_FID mode for all event types.
+ */
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do not report FAN_ONDIR without any event */
+ if (!(test_mask & ~FAN_ONDIR))
+ return 0;
+ } else {
+ user_mask &= ~FAN_ONDIR;
+ }
+
+ if (event_mask & FS_ISDIR &&
!(marks_mask & FS_ISDIR & ~marks_ignored_mask))
return 0;
- return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask &
- ~marks_ignored_mask;
+ return test_mask & user_mask;
+}
+
+static int fanotify_encode_fid(struct fanotify_event *event,
+ struct inode *inode, gfp_t gfp,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_fid *fid = &event->fid;
+ int dwords, bytes = 0;
+ int err, type;
+
+ fid->ext_fh = NULL;
+ dwords = 0;
+ err = -ENOENT;
+ type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+ if (!dwords)
+ goto out_err;
+
+ bytes = dwords << 2;
+ if (bytes > FANOTIFY_INLINE_FH_LEN) {
+ /* Treat failure to allocate fh as failure to allocate event */
+ err = -ENOMEM;
+ fid->ext_fh = kmalloc(bytes, gfp);
+ if (!fid->ext_fh)
+ goto out_err;
+ }
+
+ type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
+ &dwords, NULL);
+ err = -EINVAL;
+ if (!type || type == FILEID_INVALID || bytes != dwords << 2)
+ goto out_err;
+
+ fid->fsid = *fsid;
+ event->fh_len = bytes;
+
+ return type;
+
+out_err:
+ pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
+ "type=%d, bytes=%d, err=%i)\n",
+ fsid->val[0], fsid->val[1], type, bytes, err);
+ kfree(fid->ext_fh);
+ fid->ext_fh = NULL;
+ event->fh_len = 0;
+
+ return FILEID_INVALID;
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path)
+/*
+ * The inode to use as identifier when reporting fid depends on the event.
+ * Report the modified directory inode on dirent modification events.
+ * Report the "victim" inode otherwise.
+ * For example:
+ * FS_ATTRIB reports the child inode even if reported on a watched parent.
+ * FS_CREATE reports the modified dir inode and not the created inode.
+ */
+static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
+ const void *data, int data_type)
{
- struct fanotify_event_info *event = NULL;
+ if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+ return to_tell;
+ else if (data_type == FSNOTIFY_EVENT_INODE)
+ return (struct inode *)data;
+ else if (data_type == FSNOTIFY_EVENT_PATH)
+ return d_inode(((struct path *)data)->dentry);
+ return NULL;
+}
+
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_event *event = NULL;
gfp_t gfp = GFP_KERNEL_ACCOUNT;
+ struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
/*
* For queues with unlimited length lost events are not expected and
@@ -160,28 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
memalloc_use_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
- struct fanotify_perm_event_info *pevent;
+ struct fanotify_perm_event *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
goto out;
event = &pevent->fae;
pevent->response = 0;
+ pevent->state = FAN_EVENT_INIT;
goto init;
}
event = kmem_cache_alloc(fanotify_event_cachep, gfp);
if (!event)
goto out;
init: __maybe_unused
- fsnotify_init_event(&event->fse, inode, mask);
+ fsnotify_init_event(&event->fse, inode);
+ event->mask = mask;
if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
event->pid = get_pid(task_pid(current));
else
event->pid = get_pid(task_tgid(current));
- if (path) {
- event->path = *path;
+ event->fh_len = 0;
+ if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Report the event without a file identifier on encode error */
+ event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
+ } else if (data_type == FSNOTIFY_EVENT_PATH) {
+ event->fh_type = FILEID_ROOT;
+ event->path = *((struct path *)data);
path_get(&event->path);
} else {
+ event->fh_type = FILEID_INVALID;
event->path.mnt = NULL;
event->path.dentry = NULL;
}
@@ -190,6 +335,29 @@ out:
return event;
}
+/*
+ * Get cached fsid of the filesystem containing the object from any connector.
+ * All connectors are supposed to have the same fsid, but we do not verify that
+ * here.
+ */
+static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+{
+ int type;
+ __kernel_fsid_t fsid = {};
+
+ fsnotify_foreach_obj_type(type) {
+ if (!fsnotify_iter_should_report_type(iter_info, type))
+ continue;
+
+ fsid = iter_info->marks[type]->connector->fsid;
+ if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+ continue;
+ return fsid;
+ }
+
+ return fsid;
+}
+
static int fanotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
u32 mask, const void *data, int data_type,
@@ -197,14 +365,22 @@ static int fanotify_handle_event(struct fsnotify_group *group,
struct fsnotify_iter_info *iter_info)
{
int ret = 0;
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
struct fsnotify_event *fsn_event;
+ __kernel_fsid_t fsid = {};
BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+ BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+ BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
+ BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
+ BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
+ BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+ BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
+ BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
@@ -213,9 +389,10 @@ static int fanotify_handle_event(struct fsnotify_group *group,
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
- mask = fanotify_group_event_mask(iter_info, mask, data, data_type);
+ mask = fanotify_group_event_mask(group, iter_info, mask, data,
+ data_type);
if (!mask)
return 0;
@@ -231,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return 0;
}
- event = fanotify_alloc_event(group, inode, mask, data);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
+ fsid = fanotify_get_fsid(iter_info);
+
+ event = fanotify_alloc_event(group, inode, mask, data, data_type,
+ &fsid);
ret = -ENOMEM;
if (unlikely(!event)) {
/*
@@ -255,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group,
} else if (fanotify_is_perm_event(mask)) {
ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
iter_info);
- fsnotify_destroy_event(group, fsn_event);
}
finish:
if (fanotify_is_perm_event(mask))
@@ -275,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
static void fanotify_free_event(struct fsnotify_event *fsn_event)
{
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
event = FANOTIFY_E(fsn_event);
- path_put(&event->path);
+ if (fanotify_event_has_path(event))
+ path_put(&event->path);
+ else if (fanotify_event_has_ext_fh(event))
+ kfree(event->fid.ext_fh);
put_pid(event->pid);
- if (fanotify_is_perm_event(fsn_event->mask)) {
+ if (fanotify_is_perm_event(event->mask)) {
kmem_cache_free(fanotify_perm_event_cachep,
FANOTIFY_PE(fsn_event));
return;
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index ea05b8a401e7..68b30504284c 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,26 +2,112 @@
#include <linux/fsnotify_backend.h>
#include <linux/path.h>
#include <linux/slab.h>
+#include <linux/exportfs.h>
extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
+/* Possible states of the permission event */
+enum {
+ FAN_EVENT_INIT,
+ FAN_EVENT_REPORTED,
+ FAN_EVENT_ANSWERED,
+ FAN_EVENT_CANCELED,
+};
+
+/*
+ * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
+ * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
+ * fh_* fields increase the size of fanotify_event by another 4 bytes.
+ * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and
+ * fh_* fields are packed in a hole after mask.
+ */
+#if BITS_PER_LONG == 32
+#define FANOTIFY_INLINE_FH_LEN (3 << 2)
+#else
+#define FANOTIFY_INLINE_FH_LEN (4 << 2)
+#endif
+
+struct fanotify_fid {
+ __kernel_fsid_t fsid;
+ union {
+ unsigned char fh[FANOTIFY_INLINE_FH_LEN];
+ unsigned char *ext_fh;
+ };
+};
+
+static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
+ unsigned int fh_len)
+{
+ return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+}
+
+static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
+ struct fanotify_fid *fid2,
+ unsigned int fh_len)
+{
+ return fid1->fsid.val[0] == fid2->fsid.val[0] &&
+ fid1->fsid.val[1] == fid2->fsid.val[1] &&
+ !memcmp(fanotify_fid_fh(fid1, fh_len),
+ fanotify_fid_fh(fid2, fh_len), fh_len);
+}
+
/*
* Structure for normal fanotify events. It gets allocated in
* fanotify_handle_event() and freed when the information is retrieved by
* userspace
*/
-struct fanotify_event_info {
+struct fanotify_event {
struct fsnotify_event fse;
+ u32 mask;
/*
- * We hold ref to this path so it may be dereferenced at any point
- * during this object's lifetime
+ * Those fields are outside fanotify_fid to pack fanotify_event nicely
+ * on 64bit arch and to use fh_type as an indication of whether path
+ * or fid are used in the union:
+ * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
*/
- struct path path;
+ u8 fh_type;
+ u8 fh_len;
+ u16 pad;
+ union {
+ /*
+ * We hold ref to this path so it may be dereferenced at any
+ * point during this object's lifetime
+ */
+ struct path path;
+ /*
+ * With FAN_REPORT_FID, we do not hold any reference on the
+ * victim object. Instead we store its NFS file handle and its
+ * filesystem's fsid as a unique identifier.
+ */
+ struct fanotify_fid fid;
+ };
struct pid *pid;
};
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+ return event->fh_type == FILEID_ROOT;
+}
+
+static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+{
+ return event->fh_type != FILEID_ROOT &&
+ event->fh_type != FILEID_INVALID;
+}
+
+static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+{
+ return fanotify_event_has_fid(event) &&
+ event->fh_len > FANOTIFY_INLINE_FH_LEN;
+}
+
+static inline void *fanotify_event_fh(struct fanotify_event *event)
+{
+ return fanotify_fid_fh(&event->fid, event->fh_len);
+}
+
/*
* Structure for permission fanotify events. It gets allocated and freed in
* fanotify_handle_event() since we wait there for user response. When the
@@ -29,16 +115,17 @@ struct fanotify_event_info {
* group->notification_list to group->fanotify_data.access_list to wait for
* user response.
*/
-struct fanotify_perm_event_info {
- struct fanotify_event_info fae;
- int response; /* userspace answer to question */
+struct fanotify_perm_event {
+ struct fanotify_event fae;
+ unsigned short response; /* userspace answer to the event */
+ unsigned short state; /* state of the event */
int fd; /* fd we passed to userspace for this event */
};
-static inline struct fanotify_perm_event_info *
+static inline struct fanotify_perm_event *
FANOTIFY_PE(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+ return container_of(fse, struct fanotify_perm_event, fae.fse);
}
static inline bool fanotify_is_perm_event(u32 mask)
@@ -47,11 +134,12 @@ static inline bool fanotify_is_perm_event(u32 mask)
mask & FANOTIFY_PERM_EVENTS;
}
-static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_event_info, fse);
+ return container_of(fse, struct fanotify_event, fse);
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path);
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9c870b0d2b56..56992b32c6bb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,8 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
+#include <linux/statfs.h>
+#include <linux/exportfs.h>
#include <asm/ioctls.h>
@@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+#define FANOTIFY_EVENT_ALIGN 4
+
+static int fanotify_event_info_len(struct fanotify_event *event)
+{
+ if (!fanotify_event_has_fid(event))
+ return 0;
+
+ return roundup(sizeof(struct fanotify_event_info_fid) +
+ sizeof(struct file_handle) + event->fh_len,
+ FANOTIFY_EVENT_ALIGN);
+}
+
/*
* Get an fsnotify notification event if one exists and is small
* enough to fit in "count". Return an error pointer if the count
- * is not large enough.
- *
- * Called with the group->notification_lock held.
+ * is not large enough. When permission event is dequeued, its state is
+ * updated accordingly.
*/
static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- assert_spin_locked(&group->notification_lock);
+ size_t event_size = FAN_EVENT_METADATA_LEN;
+ struct fsnotify_event *fsn_event = NULL;
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+ spin_lock(&group->notification_lock);
if (fsnotify_notify_queue_is_empty(group))
- return NULL;
+ goto out;
- if (FAN_EVENT_METADATA_LEN > count)
- return ERR_PTR(-EINVAL);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ event_size += fanotify_event_info_len(
+ FANOTIFY_E(fsnotify_peek_first_event(group)));
+ }
- /* held the notification_lock the whole time, so this is the
- * same event we peeked above */
- return fsnotify_remove_first_event(group);
+ if (event_size > count) {
+ fsn_event = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ fsn_event = fsnotify_remove_first_event(group);
+ if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
+ FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
+out:
+ spin_unlock(&group->notification_lock);
+ return fsn_event;
}
static int create_fd(struct fsnotify_group *group,
- struct fanotify_event_info *event,
+ struct fanotify_event *event,
struct file **file)
{
int client_fd;
@@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group,
return client_fd;
}
-static int fill_event_metadata(struct fsnotify_group *group,
- struct fanotify_event_metadata *metadata,
- struct fsnotify_event *fsn_event,
- struct file **file)
-{
- int ret = 0;
- struct fanotify_event_info *event;
-
- pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
- group, metadata, fsn_event);
-
- *file = NULL;
- event = container_of(fsn_event, struct fanotify_event_info, fse);
- metadata->event_len = FAN_EVENT_METADATA_LEN;
- metadata->metadata_len = FAN_EVENT_METADATA_LEN;
- metadata->vers = FANOTIFY_METADATA_VERSION;
- metadata->reserved = 0;
- metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS;
- metadata->pid = pid_vnr(event->pid);
- if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
- metadata->fd = FAN_NOFD;
- else {
- metadata->fd = create_fd(group, event, file);
- if (metadata->fd < 0)
- ret = metadata->fd;
- }
-
- return ret;
-}
-
-static struct fanotify_perm_event_info *dequeue_event(
- struct fsnotify_group *group, int fd)
+/*
+ * Finish processing of permission event by setting it to ANSWERED state and
+ * drop group->notification_lock.
+ */
+static void finish_permission_event(struct fsnotify_group *group,
+ struct fanotify_perm_event *event,
+ unsigned int response)
+ __releases(&group->notification_lock)
{
- struct fanotify_perm_event_info *event, *return_e = NULL;
-
- spin_lock(&group->notification_lock);
- list_for_each_entry(event, &group->fanotify_data.access_list,
- fae.fse.list) {
- if (event->fd != fd)
- continue;
+ bool destroy = false;
- list_del_init(&event->fae.fse.list);
- return_e = event;
- break;
- }
+ assert_spin_locked(&group->notification_lock);
+ event->response = response;
+ if (event->state == FAN_EVENT_CANCELED)
+ destroy = true;
+ else
+ event->state = FAN_EVENT_ANSWERED;
spin_unlock(&group->notification_lock);
-
- pr_debug("%s: found return_re=%p\n", __func__, return_e);
-
- return return_e;
+ if (destroy)
+ fsnotify_destroy_event(group, &event->fae.fse);
}
static int process_access_response(struct fsnotify_group *group,
struct fanotify_response *response_struct)
{
- struct fanotify_perm_event_info *event;
+ struct fanotify_perm_event *event;
int fd = response_struct->fd;
int response = response_struct->response;
@@ -194,48 +188,115 @@ static int process_access_response(struct fsnotify_group *group,
if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
return -EINVAL;
- event = dequeue_event(group, fd);
- if (!event)
- return -ENOENT;
+ spin_lock(&group->notification_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (event->fd != fd)
+ continue;
- event->response = response;
- wake_up(&group->fanotify_data.access_waitq);
+ list_del_init(&event->fae.fse.list);
+ finish_permission_event(group, event, response);
+ wake_up(&group->fanotify_data.access_waitq);
+ return 0;
+ }
+ spin_unlock(&group->notification_lock);
+
+ return -ENOENT;
+}
+
+static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
+{
+ struct fanotify_event_info_fid info = { };
+ struct file_handle handle = { };
+ size_t fh_len = event->fh_len;
+ size_t len = fanotify_event_info_len(event);
+
+ if (!len)
+ return 0;
+
+ if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+ return -EFAULT;
+
+ /* Copy event info fid header followed by vaiable sized file handle */
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+ info.hdr.len = len;
+ info.fsid = event->fid.fsid;
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ buf += sizeof(info);
+ len -= sizeof(info);
+ handle.handle_type = event->fh_type;
+ handle.handle_bytes = fh_len;
+ if (copy_to_user(buf, &handle, sizeof(handle)))
+ return -EFAULT;
+
+ buf += sizeof(handle);
+ len -= sizeof(handle);
+ if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+ return -EFAULT;
+
+ /* Pad with 0's */
+ buf += fh_len;
+ len -= fh_len;
+ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
+ if (len > 0 && clear_user(buf, len))
+ return -EFAULT;
return 0;
}
static ssize_t copy_event_to_user(struct fsnotify_group *group,
- struct fsnotify_event *event,
+ struct fsnotify_event *fsn_event,
char __user *buf, size_t count)
{
- struct fanotify_event_metadata fanotify_event_metadata;
- struct file *f;
- int fd, ret;
-
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
- ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
- if (ret < 0)
- return ret;
+ struct fanotify_event_metadata metadata;
+ struct fanotify_event *event;
+ struct file *f = NULL;
+ int ret, fd = FAN_NOFD;
+
+ pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+
+ event = container_of(fsn_event, struct fanotify_event, fse);
+ metadata.event_len = FAN_EVENT_METADATA_LEN;
+ metadata.metadata_len = FAN_EVENT_METADATA_LEN;
+ metadata.vers = FANOTIFY_METADATA_VERSION;
+ metadata.reserved = 0;
+ metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
+ metadata.pid = pid_vnr(event->pid);
+
+ if (fanotify_event_has_path(event)) {
+ fd = create_fd(group, event, &f);
+ if (fd < 0)
+ return fd;
+ } else if (fanotify_event_has_fid(event)) {
+ metadata.event_len += fanotify_event_info_len(event);
+ }
+ metadata.fd = fd;
- fd = fanotify_event_metadata.fd;
ret = -EFAULT;
/*
* Sanity check copy size in case get_one_event() and
* fill_event_metadata() event_len sizes ever get out of sync.
*/
- if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count))
+ if (WARN_ON_ONCE(metadata.event_len > count))
goto out_close_fd;
- if (copy_to_user(buf, &fanotify_event_metadata,
- fanotify_event_metadata.event_len))
+
+ if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
goto out_close_fd;
if (fanotify_is_perm_event(event->mask))
- FANOTIFY_PE(event)->fd = fd;
+ FANOTIFY_PE(fsn_event)->fd = fd;
- if (fd != FAN_NOFD)
+ if (fanotify_event_has_path(event)) {
fd_install(fd, f);
- return fanotify_event_metadata.event_len;
+ } else if (fanotify_event_has_fid(event)) {
+ ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+ if (ret < 0)
+ return ret;
+ }
+
+ return metadata.event_len;
out_close_fd:
if (fd != FAN_NOFD) {
@@ -276,10 +337,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- spin_lock(&group->notification_lock);
kevent = get_one_event(group, count);
- spin_unlock(&group->notification_lock);
-
if (IS_ERR(kevent)) {
ret = PTR_ERR(kevent);
break;
@@ -316,11 +374,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
* Permission events get queued to wait for response. Other
* events can be destroyed now.
*/
- if (!fanotify_is_perm_event(kevent->mask)) {
+ if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
fsnotify_destroy_event(group, kevent);
} else {
if (ret <= 0) {
- FANOTIFY_PE(kevent)->response = FAN_DENY;
+ spin_lock(&group->notification_lock);
+ finish_permission_event(group,
+ FANOTIFY_PE(kevent), FAN_DENY);
wake_up(&group->fanotify_data.access_waitq);
} else {
spin_lock(&group->notification_lock);
@@ -370,7 +430,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
- struct fanotify_perm_event_info *event, *next;
+ struct fanotify_perm_event *event;
struct fsnotify_event *fsn_event;
/*
@@ -385,13 +445,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
* and simulate reply from userspace.
*/
spin_lock(&group->notification_lock);
- list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
- fae.fse.list) {
- pr_debug("%s: found group=%p event=%p\n", __func__, group,
- event);
-
+ while (!list_empty(&group->fanotify_data.access_list)) {
+ event = list_first_entry(&group->fanotify_data.access_list,
+ struct fanotify_perm_event, fae.fse.list);
list_del_init(&event->fae.fse.list);
- event->response = FAN_ALLOW;
+ finish_permission_event(group, event, FAN_ALLOW);
+ spin_lock(&group->notification_lock);
}
/*
@@ -401,13 +460,14 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
while (!fsnotify_notify_queue_is_empty(group)) {
fsn_event = fsnotify_remove_first_event(group);
- if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) {
+ if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, fsn_event);
- spin_lock(&group->notification_lock);
} else {
- FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+ finish_permission_event(group, FANOTIFY_PE(fsn_event),
+ FAN_ALLOW);
}
+ spin_lock(&group->notification_lock);
}
spin_unlock(&group->notification_lock);
@@ -598,7 +658,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *mark;
int ret;
@@ -611,7 +672,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
return ERR_PTR(-ENOMEM);
fsnotify_init_mark(mark, group);
- ret = fsnotify_add_mark_locked(mark, connp, type, 0);
+ ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
return ERR_PTR(ret);
@@ -623,7 +684,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int type,
- __u32 mask, unsigned int flags)
+ __u32 mask, unsigned int flags,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
__u32 added;
@@ -631,7 +693,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, type);
+ fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
if (IS_ERR(fsn_mark)) {
mutex_unlock(&group->mark_mutex);
return PTR_ERR(fsn_mark);
@@ -648,23 +710,23 @@ static int fanotify_add_mark(struct fsnotify_group *group,
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags);
+ FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
}
static int fanotify_add_sb_mark(struct fsnotify_group *group,
- struct super_block *sb, __u32 mask,
- unsigned int flags)
+ struct super_block *sb, __u32 mask,
+ unsigned int flags, __kernel_fsid_t *fsid)
{
return fanotify_add_mark(group, &sb->s_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_SB, mask, flags);
+ FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -679,7 +741,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
return 0;
return fanotify_add_mark(group, &inode->i_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_INODE, mask, flags);
+ FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
}
/* fanotify syscalls */
@@ -688,7 +750,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
struct fsnotify_group *group;
int f_flags, fd;
struct user_struct *user;
- struct fanotify_event_info *oevent;
+ struct fanotify_event *oevent;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@@ -715,6 +777,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
return -EINVAL;
}
+ if ((flags & FAN_REPORT_FID) &&
+ (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
+ return -EINVAL;
+
user = get_current_user();
if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
free_uid(user);
@@ -739,7 +805,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
atomic_inc(&user->fanotify_listeners);
group->memcg = get_mem_cgroup_from_mm(current->mm);
- oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
+ oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
+ FSNOTIFY_EVENT_NONE, NULL);
if (unlikely(!oevent)) {
fd = -ENOMEM;
goto out_destroy_group;
@@ -801,6 +868,48 @@ out_destroy_group:
return fd;
}
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+{
+ __kernel_fsid_t root_fsid;
+ int err;
+
+ /*
+ * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+ */
+ err = vfs_get_fsid(path->dentry, fsid);
+ if (err)
+ return err;
+
+ if (!fsid->val[0] && !fsid->val[1])
+ return -ENODEV;
+
+ /*
+ * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+ * which uses a different fsid than sb root.
+ */
+ err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+ if (err)
+ return err;
+
+ if (root_fsid.val[0] != fsid->val[0] ||
+ root_fsid.val[1] != fsid->val[1])
+ return -EXDEV;
+
+ /*
+ * We need to make sure that the file system supports at least
+ * encoding a file handle so user can use name_to_handle_at() to
+ * compare fid returned with event to the file handle of watched
+ * objects. However, name_to_handle_at() requires that the
+ * filesystem also supports decoding file handles.
+ */
+ if (!path->dentry->d_sb->s_export_op ||
+ !path->dentry->d_sb->s_export_op->fh_to_dentry)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
int dfd, const char __user *pathname)
{
@@ -809,6 +918,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct fd f;
struct path path;
+ __kernel_fsid_t __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
int ret;
@@ -871,6 +981,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ /*
+ * Events with data type inode do not carry enough information to report
+ * event->fd, so we do not allow setting a mask for inode events unless
+ * group supports reporting fid.
+ * inode events are not supported on a mount mark, because they do not
+ * carry enough information (i.e. path) to be filtered by mount point.
+ */
+ if (mask & FANOTIFY_INODE_EVENTS &&
+ (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
+ mark_type == FAN_MARK_MOUNT))
+ goto fput_and_out;
+
if (flags & FAN_MARK_FLUSH) {
ret = 0;
if (mark_type == FAN_MARK_MOUNT)
@@ -886,6 +1008,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto fput_and_out;
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ ret = fanotify_test_fid(&path, &__fsid);
+ if (ret)
+ goto path_put_and_out;
+
+ fsid = &__fsid;
+ }
+
/* inode held in place by reference to path; group by fget on fd */
if (mark_type == FAN_MARK_INODE)
inode = path.dentry->d_inode;
@@ -896,24 +1026,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
case FAN_MARK_ADD:
if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+ ret = fanotify_add_vfsmount_mark(group, mnt, mask,
+ flags, fsid);
else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags);
+ ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
+ flags, fsid);
else
- ret = fanotify_add_inode_mark(group, inode, mask, flags);
+ ret = fanotify_add_inode_mark(group, inode, mask,
+ flags, fsid);
break;
case FAN_MARK_REMOVE:
if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+ ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
+ flags);
else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags);
+ ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
+ flags);
else
- ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+ ret = fanotify_remove_inode_mark(group, inode, mask,
+ flags);
break;
default:
ret = -EINVAL;
}
+path_put_and_out:
path_put(&path);
fput_and_out:
fdput(f);
@@ -950,15 +1087,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
- fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+ fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
- KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC);
+ KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
return 0;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ecf09b6243d9..df06f3da166c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -328,16 +328,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
const unsigned char *file_name, u32 cookie)
{
struct fsnotify_iter_info iter_info = {};
- struct super_block *sb = NULL;
+ struct super_block *sb = to_tell->i_sb;
struct mount *mnt = NULL;
- __u32 mnt_or_sb_mask = 0;
+ __u32 mnt_or_sb_mask = sb->s_fsnotify_mask;
int ret = 0;
__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
if (data_is == FSNOTIFY_EVENT_PATH) {
mnt = real_mount(((const struct path *)data)->mnt);
- sb = mnt->mnt.mnt_sb;
- mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask;
+ mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
}
/* An event "on child" is not intended for a mount/sb mark */
if (mask & FS_EVENT_ON_CHILD)
@@ -350,8 +349,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (!to_tell->i_fsnotify_marks &&
- (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks)))
+ if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
+ (!mnt || !mnt->mnt_fsnotify_marks))
return 0;
/*
* if this is a modify event we may need to clear the ignored masks
@@ -366,11 +365,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+ iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+ fsnotify_first_mark(&sb->s_fsnotify_marks);
if (mnt) {
iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
- iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
- fsnotify_first_mark(&sb->s_fsnotify_marks);
}
/*
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7e4578d35b61..74ae60305189 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -5,6 +5,7 @@
struct inotify_event_info {
struct fsnotify_event fse;
+ u32 mask;
int wd;
u32 sync_cookie;
int name_len;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f4184b4f3815..ff30abd6a49b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn,
{
struct inotify_event_info *old, *new;
- if (old_fsn->mask & FS_IN_IGNORED)
- return false;
old = INOTIFY_E(old_fsn);
new = INOTIFY_E(new_fsn);
- if ((old_fsn->mask == new_fsn->mask) &&
+ if (old->mask & FS_IN_IGNORED)
+ return false;
+ if ((old->mask == new->mask) &&
(old_fsn->inode == new_fsn->inode) &&
(old->name_len == new->name_len) &&
(!old->name_len || !strcmp(old->name, new->name)))
@@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group,
return -ENOMEM;
}
+ /*
+ * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
+ * for fanotify. inotify never reported IN_ISDIR with those events.
+ * It looks like an oversight, but to avoid the risk of breaking
+ * existing inotify programs, mask the flag out from those events.
+ */
+ if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
+ mask &= ~IN_ISDIR;
+
fsn_event = &event->fse;
- fsnotify_init_event(fsn_event, inode, mask);
+ fsnotify_init_event(fsn_event, inode);
+ event->mask = mask;
event->wd = i_mark->wd;
event->sync_cookie = cookie;
event->name_len = len;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 798f1253141a..e2901fbb9f76 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
*/
pad_name_len = round_event_name_len(fsn_event);
inotify_event.len = pad_name_len;
- inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+ inotify_event.mask = inotify_mask_to_arg(event->mask);
inotify_event.wd = event->wd;
inotify_event.cookie = event->sync_cookie;
@@ -634,7 +634,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
return ERR_PTR(-ENOMEM);
}
group->overflow_event = &oevent->fse;
- fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+ fsnotify_init_event(group->overflow_event, NULL);
+ oevent->mask = FS_Q_OVERFLOW;
oevent->wd = -1;
oevent->sync_cookie = 0;
oevent->name_len = 0;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d2dd16cb5989..d593d4269561 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -82,6 +82,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
+#include <linux/ratelimit.h>
#include <linux/atomic.h>
@@ -481,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
@@ -493,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
INIT_HLIST_HEAD(&conn->list);
conn->type = type;
conn->obj = connp;
+ /* Cache fsid of filesystem containing the object */
+ if (fsid)
+ conn->fsid = *fsid;
+ else
+ conn->fsid.val[0] = conn->fsid.val[1] = 0;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
inode = igrab(fsnotify_conn_inode(conn));
/*
@@ -544,7 +551,7 @@ out:
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -553,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
if (WARN_ON(!fsnotify_valid_obj_type(type)))
return -EINVAL;
+
+ /* Backend is expected to check for zero fsid (e.g. tmpfs) */
+ if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
+ return -ENODEV;
+
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, type);
+ err = fsnotify_attach_connector_to_object(connp, type, fsid);
if (err)
return err;
goto restart;
+ } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) &&
+ (fsid->val[0] != conn->fsid.val[0] ||
+ fsid->val[1] != conn->fsid.val[1])) {
+ /*
+ * Backend is expected to check for non uniform fsid
+ * (e.g. btrfs), but maybe we missed something?
+ * Only allow setting conn->fsid once to non zero fsid.
+ * inotify and non-fid fanotify groups do not set nor test
+ * conn->fsid.
+ */
+ pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
+ "%x.%x != %x.%x\n", __func__, conn->type,
+ fsid->val[0], fsid->val[1],
+ conn->fsid.val[0], conn->fsid.val[1]);
+ err = -EXDEV;
+ goto out_err;
}
/* is mark the first mark? */
@@ -606,7 +634,7 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
@@ -627,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
if (ret)
goto err;
@@ -648,13 +676,13 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int type, int allow_dups)
+ unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
mutex_unlock(&group->mark_mutex);
return ret;
}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3c3e36745f59..5f3a54d444b5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
struct fsnotify_event *event)
{
/* Overflow events are per-group and we don't want to free them */
- if (!event || event->mask == FS_Q_OVERFLOW)
+ if (!event || event == group->overflow_event)
return;
/*
* If the event is still queued, we have a problem... Do an unreliable
@@ -141,6 +141,18 @@ queue:
return ret;
}
+void fsnotify_remove_queued_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
+{
+ assert_spin_locked(&group->notification_lock);
+ /*
+ * We need to init list head for the case of overflow event so that
+ * check in fsnotify_add_event() works
+ */
+ list_del_init(&event->list);
+ group->q_len--;
+}
+
/*
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
@@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
event = list_first_entry(&group->notification_list,
struct fsnotify_event, list);
- /*
- * We need to init list head for the case of overflow event so that
- * check in fsnotify_add_event() works
- */
- list_del_init(&event->list);
- group->q_len--;
-
+ fsnotify_remove_queued_event(group, event);
return event;
}
@@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
}
spin_unlock(&group->notification_lock);
}
-
-/*
- * fsnotify_create_event - Allocate a new event which will be sent to each
- * group's handle_event function if the group was interested in this
- * particular event.
- *
- * @inode the inode which is supposed to receive the event (sometimes a
- * parent of the inode to which the event happened.
- * @mask what actually happened.
- * @data pointer to the object which was actually affected
- * @data_type flag indication if the data is a file, path, inode, nothing...
- * @name the filename, if available
- */
-void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
- u32 mask)
-{
- INIT_LIST_HEAD(&event->list);
- event->inode = inode;
- event->mask = mask;
-}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1cbb27808e2..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
return count;
}
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
- u64 start, len, trimmed, first_group, last_group, group;
+ u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
@@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
- struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
+ trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (start >= le32_to_cpu(main_bm->i_clusters)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- len = range->len >> osb->s_clustersize_bits;
- if (start + len > le32_to_cpu(main_bm->i_clusters))
- len = le32_to_cpu(main_bm->i_clusters) - start;
-
- trace_ocfs2_trim_fs(start, len, minlen);
-
- ocfs2_trim_fs_lock_res_init(osb);
- ret = ocfs2_trim_fs_lock(osb, NULL, 1);
- if (ret < 0) {
- if (ret != -EAGAIN) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
+ /*
+ * Do some check before trim the first group.
+ */
+ if (!group) {
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
goto out_unlock;
}
- mlog(ML_NOTICE, "Wait for trim on device (%s) to "
- "finish, which is running from another node.\n",
- osb->dev_str);
- ret = ocfs2_trim_fs_lock(osb, &info, 0);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
- goto out_unlock;
- }
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
- if (info.tf_valid && info.tf_success &&
- info.tf_start == start && info.tf_len == len &&
- info.tf_minlen == minlen) {
- /* Avoid sending duplicated trim to a shared device */
- mlog(ML_NOTICE, "The same trim on device (%s) was "
- "just done from node (%u), return.\n",
- osb->dev_str, info.tf_nodenum);
- range->len = info.tf_trimlen;
- goto out_trimunlock;
- }
+ /*
+ * Determine first and last group to examine based on
+ * start and len
+ */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb,
+ first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode,
+ start + len - 1);
+ group = first_group;
}
- info.tf_nodenum = osb->node_num;
- info.tf_start = start;
- info.tf_len = len;
- info.tf_minlen = minlen;
-
- /* Determine first and last group to examine based on start and len */
- first_group = ocfs2_which_cluster_group(main_bm_inode, start);
- if (first_group == osb->first_cluster_group_blkno)
- first_bit = start;
- else
- first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
- last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
- last_bit = osb->bitmap_cpg;
-
- trimmed = 0;
- for (group = first_group; group <= last_group;) {
+ do {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
- }
- range->len = trimmed * sb->s_blocksize;
+ } while (0);
- info.tf_trimlen = range->len;
- info.tf_success = (ret ? 0 : 1);
- pinfo = &info;
-out_trimunlock:
- ocfs2_trim_fs_unlock(osb, pinfo);
- ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
+ main_bm_bh = NULL;
out_mutex:
inode_unlock(main_bm_inode);
iput(main_bm_inode);
+
+ /*
+ * If all the groups trim are not done or failed, but we should release
+ * main_bm related locks for avoiding the current IO starve, then go to
+ * trim the next group
+ */
+ if (ret >= 0 && group <= last_group)
+ goto next_group;
out:
+ range->len = trimmed * sb->s_blocksize;
+ return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+ ocfs2_trim_fs_lock_res_init(osb);
+
+ trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == range->start &&
+ info.tf_len == range->len &&
+ info.tf_minlen == range->minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = range->start;
+ info.tf_len = range->len;
+ info.tf_minlen = range->minlen;
+
+ ret = ocfs2_trim_mainbm(sb, range);
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret < 0 ? 0 : 1);
+ pinfo = &info;
+out:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166cc23a0..4ac775e32240 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
- o2net_disconnect_node(node);
+ if (cluster->cl_nodes[node->nd_num] == node) {
+ o2net_disconnect_node(node);
- if (cluster->cl_has_local &&
- (cluster->cl_local_node == node->nd_num)) {
- cluster->cl_has_local = 0;
- cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
- o2net_stop_listening(node);
+ if (cluster->cl_has_local &&
+ (cluster->cl_local_node == node->nd_num)) {
+ cluster->cl_has_local = 0;
+ cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+ o2net_stop_listening(node);
+ }
}
/* XXX call into net to stop this node from trading messages */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7c835824247e..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+ /* Only one trimfs thread are allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, lockres);
ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
}
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_lock_res osb_trim_fs_lockres;
+ struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d7407994f308..ea0756d83250 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -55,7 +55,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot *si_slots;
+ struct ocfs2_slot si_slots[];
};
@@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info) +
- (sizeof(struct ocfs2_slot) * osb->max_slots),
- GFP_KERNEL);
+ si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
@@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_slots = (struct ocfs2_slot *)((char *)si +
- sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_is_hard_readonly(osb))
goto leave;
+ mutex_init(&osb->obs_trim_fs_mutex);
+
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a5a2fe76568f..b094d3d79354 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -398,8 +398,6 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter
loff_t pos = iocb->ki_pos;
ssize_t rc = 0;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
orangefs_stats.reads++;
@@ -416,8 +414,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
loff_t pos;
ssize_t rc;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
inode_lock(file->f_mapping->host);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index f038235c64bd..c3334eca18c7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -261,11 +261,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
generic_fillattr(inode, stat);
/* override block size reported to stat */
- if (request_mask & STATX_SIZE)
- stat->result_mask = STATX_BASIC_STATS;
- else
- stat->result_mask = STATX_BASIC_STATS &
- ~STATX_SIZE;
+ if (!(request_mask & STATX_SIZE))
+ stat->result_mask &= ~STATX_SIZE;
stat->attributes_mask = STATX_ATTR_IMMUTABLE |
STATX_ATTR_APPEND;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 9e62dcf06fc4..68b3303e4b46 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -443,6 +443,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
int err;
+ /*
+ * Copy up data first and then xattrs. Writing data after
+ * xattrs will remove security.capability xattr automatically.
+ */
+ if (S_ISREG(c->stat.mode) && !c->metacopy) {
+ struct path upperpath, datapath;
+
+ ovl_path_upper(c->dentry, &upperpath);
+ if (WARN_ON(upperpath.dentry != NULL))
+ return -EIO;
+ upperpath.dentry = temp;
+
+ ovl_path_lowerdata(c->dentry, &datapath);
+ err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
+ if (err)
+ return err;
+ }
+
err = ovl_copy_xattr(c->lowerpath.dentry, temp);
if (err)
return err;
@@ -460,19 +478,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
return err;
}
- if (S_ISREG(c->stat.mode) && !c->metacopy) {
- struct path upperpath, datapath;
-
- ovl_path_upper(c->dentry, &upperpath);
- BUG_ON(upperpath.dentry != NULL);
- upperpath.dentry = temp;
-
- ovl_path_lowerdata(c->dentry, &datapath);
- err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
- if (err)
- return err;
- }
-
if (c->metacopy) {
err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY,
NULL, 0, -EOPNOTSUPP);
@@ -737,6 +742,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
{
struct path upperpath, datapath;
int err;
+ char *capability = NULL;
+ ssize_t uninitialized_var(cap_size);
ovl_path_upper(c->dentry, &upperpath);
if (WARN_ON(upperpath.dentry == NULL))
@@ -746,15 +753,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
if (WARN_ON(datapath.dentry == NULL))
return -EIO;
+ if (c->stat.size) {
+ err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
+ &capability, 0);
+ if (err < 0 && err != -ENODATA)
+ goto out;
+ }
+
err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
if (err)
- return err;
+ goto out_free;
+
+ /*
+ * Writing to upper file will clear security.capability xattr. We
+ * don't want that to happen for normal copy-up operation.
+ */
+ if (capability) {
+ err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
+ capability, cap_size, 0);
+ if (err)
+ goto out_free;
+ }
+
err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY);
if (err)
- return err;
+ goto out_free;
ovl_set_upperdata(d_inode(c->dentry));
+out_free:
+ kfree(capability);
+out:
return err;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5e45cb3630a0..9c6018287d57 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
int ovl_check_metacopy_xattr(struct dentry *dentry);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct dentry *dentry, int padding);
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+ size_t padding);
static inline bool ovl_is_impuredir(struct dentry *dentry)
{
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 7c01327b1852..4035e640f402 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -863,28 +863,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+ size_t padding)
{
- int res;
- char *s, *next, *buf = NULL;
+ ssize_t res;
+ char *buf = NULL;
- res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0);
+ res = vfs_getxattr(dentry, name, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
- return NULL;
+ return -ENODATA;
goto fail;
}
- buf = kzalloc(res + padding + 1, GFP_KERNEL);
- if (!buf)
- return ERR_PTR(-ENOMEM);
+ if (res != 0) {
+ buf = kzalloc(res + padding, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- if (res == 0)
- goto invalid;
+ res = vfs_getxattr(dentry, name, buf, res);
+ if (res < 0)
+ goto fail;
+ }
+ *value = buf;
+
+ return res;
+
+fail:
+ pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n",
+ name, res);
+ kfree(buf);
+ return res;
+}
- res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res);
+char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+{
+ int res;
+ char *s, *next, *buf = NULL;
+
+ res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1);
+ if (res == -ENODATA)
+ return NULL;
if (res < 0)
- goto fail;
+ return ERR_PTR(res);
if (res == 0)
goto invalid;
@@ -900,15 +921,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
}
return buf;
-
-err_free:
- kfree(buf);
- return ERR_PTR(res);
-fail:
- pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res);
- goto err_free;
invalid:
pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf);
res = -EINVAL;
- goto err_free;
+ kfree(buf);
+ return ERR_PTR(res);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..070aad543382 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled())
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge(page, 0);
__SetPageLocked(page);
return 0;
}
@@ -226,8 +225,15 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
}
EXPORT_SYMBOL(generic_pipe_buf_release);
+/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
- .can_merge = 1,
+ .confirm = generic_pipe_buf_confirm,
+ .release = anon_pipe_buf_release,
+ .steal = anon_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = anon_pipe_buf_steal,
@@ -235,13 +241,32 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
};
static const struct pipe_buf_operations packet_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = anon_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
+/**
+ * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
+ * @buf: the buffer to mark
+ *
+ * Description:
+ * This function ensures that no future writes will be merged into the
+ * given &struct pipe_buffer. This is necessary when multiple pipe buffers
+ * share the same backing page.
+ */
+void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
+{
+ if (buf->ops == &anon_pipe_buf_ops)
+ buf->ops = &anon_pipe_buf_nomerge_ops;
+}
+
+static bool pipe_buf_can_merge(struct pipe_buffer *buf)
+{
+ return buf->ops == &anon_pipe_buf_ops;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
@@ -379,7 +404,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
struct pipe_buffer *buf = pipe->bufs + lastbuf;
int offset = buf->offset + buf->len;
- if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) {
+ if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
diff --git a/fs/pnode.c b/fs/pnode.c
index 1100e810d855..7ea6cfb65077 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
}
/* all accesses are serialized by namespace_sem */
-static struct user_namespace *user_ns;
static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
@@ -260,9 +259,6 @@ static int propagate_one(struct mount *m)
type |= CL_MAKE_SHARED;
}
- /* Notice when we are propagating across user namespaces */
- if (m->mnt_ns->user_ns != user_ns)
- type |= CL_UNPRIVILEGED;
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
@@ -303,7 +299,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
* propagate_one(); everything is serialized by namespace_sem,
* so globals will do just fine.
*/
- user_ns = current->nsproxy->mnt_ns->user_ns;
last_dest = dest_mnt;
first_source = source_mnt;
last_source = source_mnt;
diff --git a/fs/pnode.h b/fs/pnode.h
index dc87e65becd2..3960a83666cf 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -27,8 +27,7 @@
#define CL_MAKE_SHARED 0x08
#define CL_PRIVATE 0x10
#define CL_SHARED_TO_SLAVE 0x20
-#define CL_UNPRIVILEGED 0x40
-#define CL_COPY_MNT_NS_FILE 0x80
+#define CL_COPY_MNT_NS_FILE 0x40
#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9d428d5a0ac8..2edbb657f859 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
- seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
case -EINVAL:
- seq_printf(m, "unknown");
+ seq_puts(m, "unknown");
break;
case PR_SPEC_NOT_AFFECTED:
- seq_printf(m, "not vulnerable");
+ seq_puts(m, "not vulnerable");
break;
case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
- seq_printf(m, "thread force mitigated");
+ seq_puts(m, "thread force mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
- seq_printf(m, "thread mitigated");
+ seq_puts(m, "thread mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
- seq_printf(m, "thread vulnerable");
+ seq_puts(m, "thread vulnerable");
break;
case PR_SPEC_DISABLE:
- seq_printf(m, "globally mitigated");
+ seq_puts(m, "globally mitigated");
break;
default:
- seq_printf(m, "vulnerable");
+ seq_puts(m, "vulnerable");
break;
}
seq_putc(m, '\n');
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 633a63462573..f5ebdd87afb2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
@@ -92,7 +93,6 @@
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
-#include <linux/flex_array.h>
#include <linux/posix-timers.h>
#include <trace/events/oom.h>
#include "internal.h"
@@ -140,9 +140,13 @@ struct pid_entry {
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
#define ONE(NAME, MODE, show) \
- NOD(NAME, (S_IFREG|(MODE)), \
+ NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+#define ATTR(LSM, NAME, MODE) \
+ NOD(NAME, (S_IFREG|(MODE)), \
+ NULL, &proc_pid_attr_operations, \
+ { .lsm = LSM })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -456,7 +460,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
if (unlikely(!sched_info_on()))
- seq_printf(m, "0 0 0\n");
+ seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
@@ -1086,10 +1090,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
task_lock(p);
if (!p->vfork_done && process_shares_mm(p, mm)) {
- pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
- task_pid_nr(p), p->comm,
- p->signal->oom_score_adj, oom_adj,
- task_pid_nr(task), task->comm);
p->signal->oom_score_adj = oom_adj;
if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
p->signal->oom_score_adj_min = (short)oom_adj;
@@ -1210,7 +1210,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
#define TMPBUFLEN 11
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
@@ -2142,11 +2142,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
- struct flex_array *fa = NULL;
- struct map_files_info info;
+ GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ genradix_init(&fa);
+
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2178,35 +2179,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
*/
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
- if (vma->vm_file && ++pos > ctx->pos)
- nr_files++;
- }
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
- if (nr_files) {
- fa = flex_array_alloc(sizeof(info), nr_files,
- GFP_KERNEL);
- if (!fa || flex_array_prealloc(fa, 0, nr_files,
- GFP_KERNEL)) {
+ p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+ if (!p) {
ret = -ENOMEM;
- if (fa)
- flex_array_free(fa);
up_read(&mm->mmap_sem);
mmput(mm);
goto out_put_task;
}
- for (i = 0, vma = mm->mmap, pos = 2; vma;
- vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (++pos <= ctx->pos)
- continue;
- info.start = vma->vm_start;
- info.end = vma->vm_end;
- info.mode = vma->vm_file->f_mode;
- if (flex_array_put(fa, i++, &info, GFP_KERNEL))
- BUG();
- }
+ p->start = vma->vm_start;
+ p->end = vma->vm_end;
+ p->mode = vma->vm_file->f_mode;
}
up_read(&mm->mmap_sem);
mmput(mm);
@@ -2215,7 +2203,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
unsigned int len;
- p = flex_array_get(fa, i);
+ p = genradix_ptr(&fa, i);
len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
buf, len,
@@ -2225,12 +2213,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
break;
ctx->pos++;
}
- if (fa)
- flex_array_free(fa);
out_put_task:
put_task_struct(task);
out:
+ genradix_free(&fa);
return ret;
}
@@ -2459,11 +2446,10 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
static struct dentry *proc_pident_lookup(struct inode *dir,
struct dentry *dentry,
- const struct pid_entry *ents,
- unsigned int nents)
+ const struct pid_entry *p,
+ const struct pid_entry *end)
{
struct task_struct *task = get_proc_task(dir);
- const struct pid_entry *p, *last;
struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
@@ -2473,8 +2459,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
* Yes, it does not scale. And it should not. Don't add
* new entries into /proc/<tgid>/ without very good reasons.
*/
- last = &ents[nents];
- for (p = ents; p < last; p++) {
+ for (; p < end; p++) {
if (p->len != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, p->name, p->len)) {
@@ -2525,7 +2510,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsm,
(char*)file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
@@ -2574,7 +2559,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
+ rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ file->f_path.dentry->d_name.name, page,
+ count);
mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
kfree(page);
@@ -2588,13 +2575,53 @@ static const struct file_operations proc_pid_attr_operations = {
.llseek = generic_file_llseek,
};
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+ struct dir_context *ctx) \
+{ \
+ return proc_pident_readdir(filp, ctx, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+ .read = generic_read_dir, \
+ .iterate = proc_##LSM##_attr_dir_iterate, \
+ .llseek = default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+ struct dentry *dentry, unsigned int flags) \
+{ \
+ return proc_pident_lookup(dir, dentry, \
+ LSM##_attr_dir_stuff, \
+ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+ .lookup = proc_##LSM##_attr_dir_lookup, \
+ .getattr = pid_getattr, \
+ .setattr = proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+ ATTR("smack", "current", 0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
static const struct pid_entry attr_dir_stuff[] = {
- REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("prev", S_IRUGO, proc_pid_attr_operations),
- REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ ATTR(NULL, "current", 0666),
+ ATTR(NULL, "prev", 0444),
+ ATTR(NULL, "exec", 0666),
+ ATTR(NULL, "fscreate", 0666),
+ ATTR(NULL, "keycreate", 0666),
+ ATTR(NULL, "sockcreate", 0666),
+#ifdef CONFIG_SECURITY_SMACK
+ DIR("smack", 0555,
+ proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
};
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -2613,7 +2640,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+ attr_dir_stuff,
+ attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
}
static const struct inode_operations proc_attr_dir_inode_operations = {
@@ -3002,7 +3030,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
@@ -3049,7 +3077,8 @@ static const struct file_operations proc_tgid_base_operations = {
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+ tgid_base_stuff,
+ tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
}
static const struct inode_operations proc_tgid_base_inode_operations = {
@@ -3165,7 +3194,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
return d_splice_alias(inode, dentry);
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
unsigned tgid;
@@ -3390,7 +3419,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
@@ -3421,7 +3450,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ tid_base_stuff,
+ tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
}
static const struct file_operations proc_tid_base_operations = {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index da649ccd6804..fc7e38def174 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,7 +24,6 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
-#include <linux/magic.h>
#include <linux/uaccess.h>
@@ -122,13 +121,12 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-static const struct super_operations proc_sops = {
+const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
- .remount_fs = proc_remount,
.show_options = proc_show_options,
};
@@ -488,51 +486,3 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
pde_put(de);
return inode;
}
-
-int proc_fill_super(struct super_block *s, void *data, int silent)
-{
- struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
- struct inode *root_inode;
- int ret;
-
- if (!proc_parse_options(data, ns))
- return -EINVAL;
-
- /* User space would break if executables or devices appear on proc */
- s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
- s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
- s->s_magic = PROC_SUPER_MAGIC;
- s->s_op = &proc_sops;
- s->s_time_gran = 1;
-
- /*
- * procfs isn't actually a stacking filesystem; however, there is
- * too much magic going on inside it to permit stacking things on
- * top of it
- */
- s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-
- /* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
-
- pde_get(&proc_root);
- root_inode = proc_get_inode(s, &proc_root);
- if (!root_inode) {
- pr_err("proc_fill_super: get root inode failed\n");
- return -ENOMEM;
- }
-
- s->s_root = d_make_root(root_inode);
- if (!s->s_root) {
- pr_err("proc_fill_super: allocate dentry failed\n");
- return -ENOMEM;
- }
-
- ret = proc_setup_self(s);
- if (ret) {
- return ret;
- }
- return proc_setup_thread_self(s);
-}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 95b14196f284..d1671e97f7fe 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -82,6 +82,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
+ const char *lsm;
};
struct proc_inode {
@@ -162,7 +163,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
@@ -206,13 +207,12 @@ struct pde_opener {
struct completion *c;
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
-
extern const struct inode_operations proc_pid_link_inode_operations;
+extern const struct super_operations proc_sops;
void proc_init_kmemcache(void);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
@@ -270,10 +270,8 @@ static inline void proc_tty_init(void) {}
* root.c
*/
extern struct proc_dir_entry proc_root;
-extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
-extern int proc_remount(struct super_block *, int *, char *);
/*
* task_[no]mmu.c
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 40b05e0d4274..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
else if (page_count(page) == 0 && is_free_buddy_page(page))
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d2eca6..8b145e7b9661 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -19,86 +19,178 @@
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/cred.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
#include "internal.h"
-enum {
- Opt_gid, Opt_hidepid, Opt_err,
+struct proc_fs_context {
+ struct pid_namespace *pid_ns;
+ unsigned int mask;
+ int hidepid;
+ int gid;
};
-static const match_table_t tokens = {
- {Opt_hidepid, "hidepid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_err, NULL},
+enum proc_param {
+ Opt_gid,
+ Opt_hidepid,
};
-int proc_parse_options(char *options, struct pid_namespace *pid)
+static const struct fs_parameter_spec proc_param_specs[] = {
+ fsparam_u32("gid", Opt_gid),
+ fsparam_u32("hidepid", Opt_hidepid),
+ {}
+};
+
+static const struct fs_parameter_description proc_fs_parameters = {
+ .name = "proc",
+ .specs = proc_param_specs,
+};
+
+static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- pid->pid_gid = make_kgid(current_user_ns(), option);
- break;
- case Opt_hidepid:
- if (match_int(&args[0], &option))
- return 0;
- if (option < HIDEPID_OFF ||
- option > HIDEPID_INVISIBLE) {
- pr_err("proc: hidepid value must be between 0 and 2.\n");
- return 0;
- }
- pid->hide_pid = option;
- break;
- default:
- pr_err("proc: unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
- }
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, &proc_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_gid:
+ ctx->gid = result.uint_32;
+ break;
+
+ case Opt_hidepid:
+ ctx->hidepid = result.uint_32;
+ if (ctx->hidepid < HIDEPID_OFF ||
+ ctx->hidepid > HIDEPID_INVISIBLE)
+ return invalf(fc, "proc: hidepid value must be between 0 and 2.\n");
+ break;
+
+ default:
+ return -EINVAL;
}
- return 1;
+ ctx->mask |= 1 << opt;
+ return 0;
}
-int proc_remount(struct super_block *sb, int *flags, char *data)
+static void proc_apply_options(struct super_block *s,
+ struct fs_context *fc,
+ struct pid_namespace *pid_ns,
+ struct user_namespace *user_ns)
{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->mask & (1 << Opt_gid))
+ pid_ns->pid_gid = make_kgid(user_ns, ctx->gid);
+ if (ctx->mask & (1 << Opt_hidepid))
+ pid_ns->hide_pid = ctx->hidepid;
+}
+
+static int proc_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info);
+ struct inode *root_inode;
+ int ret;
+
+ proc_apply_options(s, fc, pid_ns, current_user_ns());
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = PROC_SUPER_MAGIC;
+ s->s_op = &proc_sops;
+ s->s_time_gran = 1;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+
+ /* procfs dentries and inodes don't require IO to create */
+ s->s_shrink.seeks = 0;
+
+ pde_get(&proc_root);
+ root_inode = proc_get_inode(s, &proc_root);
+ if (!root_inode) {
+ pr_err("proc_fill_super: get root inode failed\n");
+ return -ENOMEM;
+ }
+
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root) {
+ pr_err("proc_fill_super: allocate dentry failed\n");
+ return -ENOMEM;
+ }
+
+ ret = proc_setup_self(s);
+ if (ret) {
+ return ret;
+ }
+ return proc_setup_thread_self(s);
+}
+
+static int proc_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
struct pid_namespace *pid = sb->s_fs_info;
sync_filesystem(sb);
- return !proc_parse_options(data, pid);
+
+ proc_apply_options(sb, fc, pid, current_user_ns());
+ return 0;
}
-static struct dentry *proc_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int proc_get_tree(struct fs_context *fc)
{
- struct pid_namespace *ns;
+ struct proc_fs_context *ctx = fc->fs_private;
- if (flags & SB_KERNMOUNT) {
- ns = data;
- data = NULL;
- } else {
- ns = task_active_pid_ns(current);
- }
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ fc->s_fs_info = ctx->pid_ns;
+ return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super);
+}
- return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
+static void proc_fs_context_free(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->pid_ns)
+ put_pid_ns(ctx->pid_ns);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations proc_fs_context_ops = {
+ .free = proc_fs_context_free,
+ .parse_param = proc_parse_param,
+ .get_tree = proc_get_tree,
+ .reconfigure = proc_reconfigure,
+};
+
+static int proc_init_fs_context(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ fc->fs_private = ctx;
+ fc->ops = &proc_fs_context_ops;
+ return 0;
}
static void proc_kill_sb(struct super_block *sb)
@@ -115,10 +207,11 @@ static void proc_kill_sb(struct super_block *sb)
}
static struct file_system_type proc_fs_type = {
- .name = "proc",
- .mount = proc_mount,
- .kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "proc",
+ .init_fs_context = proc_init_fs_context,
+ .parameters = &proc_fs_parameters,
+ .kill_sb = proc_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
@@ -154,9 +247,9 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat,
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_pid_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dentry, flags))
return NULL;
-
+
return proc_lookup(dir, dentry, flags);
}
@@ -209,9 +302,28 @@ struct proc_dir_entry proc_root = {
int pid_ns_prepare_proc(struct pid_namespace *ns)
{
+ struct proc_fs_context *ctx;
+ struct fs_context *fc;
struct vfsmount *mnt;
- mnt = kern_mount_data(&proc_fs_type, ns);
+ fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ if (fc->user_ns != ns->user_ns) {
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ns->user_ns);
+ }
+
+ ctx = fc->fs_private;
+ if (ctx->pid_ns != ns) {
+ put_pid_ns(ctx->pid_ns);
+ get_pid_ns(ns);
+ ctx->pid_ns = ns;
+ }
+
+ mnt = fc_mount(fc);
+ put_fs_context(fc);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 127265e5c55f..57c0a1047250 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
@@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
d_add(self, inode);
+ ret = 0;
} else {
dput(self);
- self = ERR_PTR(-ENOMEM);
}
- } else {
- self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- return PTR_ERR(self);
- }
- ns->proc_self = self;
- return 0;
+ else
+ ns->proc_self = self;
+
+ return ret;
}
void __init proc_self_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 535eda7857cf..80c305f206bb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,21 +23,21 @@
#ifdef arch_idle_time
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
idle += arch_idle_time(cpu);
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait;
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
if (cpu_online(cpu) && nr_iowait_cpu(cpu))
iowait += arch_idle_time(cpu);
return iowait;
@@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu)
#else
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu)
if (idle_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
else
idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
@@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu)
if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;
@@ -79,6 +79,31 @@ static u64 get_iowait_time(int cpu)
#endif
+static void show_irq_gap(struct seq_file *p, unsigned int gap)
+{
+ static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+ while (gap > 0) {
+ unsigned int inc;
+
+ inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
+ seq_write(p, zeros, 2 * inc);
+ gap -= inc;
+ }
+}
+
+static void show_all_irqs(struct seq_file *p)
+{
+ unsigned int i, next = 0;
+
+ for_each_active_irq(i) {
+ show_irq_gap(p, i - next);
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+ next = i + 1;
+ }
+ show_irq_gap(p, nr_irqs - next);
+}
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
@@ -95,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
- user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(i);
- iowait += get_iowait_time(i);
- irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+ user += kcs->cpustat[CPUTIME_USER];
+ nice += kcs->cpustat[CPUTIME_NICE];
+ system += kcs->cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(kcs, i);
+ iowait += get_iowait_time(kcs, i);
+ irq += kcs->cpustat[CPUTIME_IRQ];
+ softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal += kcs->cpustat[CPUTIME_STEAL];
+ guest += kcs->cpustat[CPUTIME_GUEST];
+ guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -130,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(i);
- iowait = get_iowait_time(i);
- irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ user = kcs->cpustat[CPUTIME_USER];
+ nice = kcs->cpustat[CPUTIME_NICE];
+ system = kcs->cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(kcs, i);
+ iowait = get_iowait_time(kcs, i);
+ irq = kcs->cpustat[CPUTIME_IRQ];
+ softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal = kcs->cpustat[CPUTIME_STEAL];
+ guest = kcs->cpustat[CPUTIME_GUEST];
+ guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -156,9 +185,7 @@ static int show_stat(struct seq_file *p, void *v)
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
- /* sum again ? it could be updated? */
- for_each_irq_nr(j)
- seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
+ show_all_irqs(p);
seq_printf(p,
"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 85b0ef890b28..92a91e7816d8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
- SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+ SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
@@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 0b63d68dedb2..36bf0f2e102e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
else
bytes += kobjsize(current->files);
- if (current->sighand && atomic_read(&current->sighand->count) > 1)
+ if (current->sighand && refcount_read(&current->sighand->count) > 1)
sbytes += kobjsize(current->sighand);
else
bytes += kobjsize(current->sighand);
@@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
seq_file_path(m, file, "");
} else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
- seq_printf(m, "[stack]");
+ seq_puts(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b905010ca9eb..f61ae53533f5 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *thread_self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
@@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
d_add(thread_self, inode);
+ ret = 0;
} else {
dput(thread_self);
- thread_self = ERR_PTR(-ENOMEM);
}
- } else {
- thread_self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(thread_self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
- return PTR_ERR(thread_self);
- }
- ns->proc_thread_self = thread_self;
- return 0;
+ else
+ ns->proc_thread_self = thread_self;
+
+ return ret;
}
void __init proc_thread_self_init(void)
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2d1066ed3c28..75887a269b64 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -501,6 +501,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
struct pstore_record record;
+ if (!c)
+ return;
+
pstore_record_init(&record, psinfo);
record.type = PSTORE_TYPE_CONSOLE;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 898c8321b343..c5c685589e36 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -110,7 +110,6 @@ struct ramoops_context {
};
static struct platform_device *dummy;
-static struct ramoops_platform_data *dummy_data;
static int ramoops_pstore_open(struct pstore_info *psi)
{
@@ -346,17 +345,15 @@ out:
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
struct pstore_record *record)
{
- char *hdr;
+ char hdr[36]; /* "===="(4), %lld(20), "."(1), %06lu(6), "-%c\n"(3) */
size_t len;
- hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
+ len = scnprintf(hdr, sizeof(hdr),
+ RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
(time64_t)record->time.tv_sec,
record->time.tv_nsec / 1000,
record->compressed ? 'C' : 'D');
- WARN_ON_ONCE(!hdr);
- len = hdr ? strlen(hdr) : 0;
persistent_ram_write(prz, hdr, len);
- kfree(hdr);
return len;
}
@@ -424,6 +421,9 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
/* Build header and append record contents. */
hlen = ramoops_write_kmsg_hdr(prz, record);
+ if (!hlen)
+ return -ENOMEM;
+
size = record->size;
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
@@ -716,15 +716,6 @@ static int ramoops_probe(struct platform_device *pdev)
phys_addr_t paddr;
int err = -EINVAL;
- if (dev_of_node(dev) && !pdata) {
- pdata = &pdata_local;
- memset(pdata, 0, sizeof(*pdata));
-
- err = ramoops_parse_dt(pdev, pdata);
- if (err < 0)
- goto fail_out;
- }
-
/*
* Only a single ramoops area allowed at a time, so fail extra
* probes.
@@ -734,6 +725,15 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
+ if (dev_of_node(dev) && !pdata) {
+ pdata = &pdata_local;
+ memset(pdata, 0, sizeof(*pdata));
+
+ err = ramoops_parse_dt(pdev, pdata);
+ if (err < 0)
+ goto fail_out;
+ }
+
/* Make sure we didn't get bogus platform data pointer. */
if (!pdata) {
pr_err("NULL platform data\n");
@@ -892,13 +892,12 @@ static inline void ramoops_unregister_dummy(void)
{
platform_device_unregister(dummy);
dummy = NULL;
-
- kfree(dummy_data);
- dummy_data = NULL;
}
static void __init ramoops_register_dummy(void)
{
+ struct ramoops_platform_data pdata;
+
/*
* Prepare a dummy platform data structure to carry the module
* parameters. If mem_size isn't set, then there are no module
@@ -909,30 +908,25 @@ static void __init ramoops_register_dummy(void)
pr_info("using module parameters\n");
- dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL);
- if (!dummy_data) {
- pr_info("could not allocate pdata\n");
- return;
- }
-
- dummy_data->mem_size = mem_size;
- dummy_data->mem_address = mem_address;
- dummy_data->mem_type = mem_type;
- dummy_data->record_size = record_size;
- dummy_data->console_size = ramoops_console_size;
- dummy_data->ftrace_size = ramoops_ftrace_size;
- dummy_data->pmsg_size = ramoops_pmsg_size;
- dummy_data->dump_oops = dump_oops;
- dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
+ memset(&pdata, 0, sizeof(pdata));
+ pdata.mem_size = mem_size;
+ pdata.mem_address = mem_address;
+ pdata.mem_type = mem_type;
+ pdata.record_size = record_size;
+ pdata.console_size = ramoops_console_size;
+ pdata.ftrace_size = ramoops_ftrace_size;
+ pdata.pmsg_size = ramoops_pmsg_size;
+ pdata.dump_oops = dump_oops;
+ pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
* (using 1 byte for ECC isn't much of use anyway).
*/
- dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
+ pdata.ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
dummy = platform_device_register_data(NULL, "ramoops", -1,
- dummy_data, sizeof(struct ramoops_platform_data));
+ &pdata, sizeof(pdata));
if (IS_ERR(dummy)) {
pr_info("could not create platform device: %ld\n",
PTR_ERR(dummy));
diff --git a/fs/read_write.c b/fs/read_write.c
index ff3c5e6f87cf..177ccc3d405a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -426,7 +426,7 @@ ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
ssize_t result;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
result = vfs_read(file, (void __user *)buf, count, pos);
set_fs(old_fs);
@@ -478,8 +478,8 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
return ret;
}
-ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
- loff_t *pos)
+static ssize_t __vfs_write(struct file *file, const char __user *p,
+ size_t count, loff_t *pos)
{
if (file->f_op->write)
return file->f_op->write(file, p, count, pos);
@@ -499,7 +499,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
return -EINVAL;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
p = (__force const char __user *)buf;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
@@ -521,7 +521,7 @@ ssize_t kernel_write(struct file *file, const void *buf, size_t count,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_write(file, (__force const char __user *)buf, count, pos);
set_fs(old_fs);
@@ -1238,6 +1238,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
+ if (pos == -1)
+ return do_compat_readv(fd, vec, vlen, flags);
+
return do_compat_preadv64(fd, vec, vlen, pos, flags);
}
#endif
@@ -1344,6 +1347,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
+ if (pos == -1)
+ return do_compat_writev(fd, vec, vlen, flags);
+
return do_compat_pwritev64(fd, vec, vlen, pos, flags);
}
#endif
diff --git a/fs/select.c b/fs/select.c
index d0f35dbc0e8f..6cbc9ff56ba0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1379,7 +1379,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
struct old_timespec32 __user *, tsp, void __user *, sig)
{
@@ -1402,7 +1402,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
#endif
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
diff --git a/fs/splice.c b/fs/splice.c
index de2ede048473..3ee7e82df48f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -138,7 +138,6 @@ error:
}
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
- .can_merge = 0,
.confirm = page_cache_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
@@ -156,7 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
}
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
@@ -326,7 +324,6 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
EXPORT_SYMBOL(generic_file_splice_read);
const struct pipe_buf_operations default_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -341,7 +338,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_nosteal,
@@ -357,7 +353,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
set_fs(old_fs);
@@ -1123,6 +1119,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (ipipe == opipe)
return -EINVAL;
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
return splice_pipe_to_pipe(ipipe, opipe, len, flags);
}
@@ -1148,6 +1147,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (unlikely(ret < 0))
return ret;
+ if (in->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
file_start_write(out);
ret = do_splice_from(ipipe, out, &offset, len, flags);
file_end_write(out);
@@ -1172,6 +1174,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
offset = in->f_pos;
}
+ if (out->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
if (!ret)
@@ -1597,6 +1602,8 @@ retry:
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ pipe_buf_mark_unmergeable(obuf);
+
obuf->len = len;
opipe->nrbufs++;
ibuf->offset += obuf->len;
@@ -1671,6 +1678,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ pipe_buf_mark_unmergeable(obuf);
+
if (obuf->len > len)
obuf->len = len;
@@ -1717,6 +1726,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
* copying the data.
*/
if (ipipe && opipe && ipipe != opipe) {
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
/*
* Keep going, unless we encounter an error. The ipipe/opipe
* ordering doesn't really matter.
diff --git a/fs/stat.c b/fs/stat.c
index adbfcd86c81b..c38e4c2e1221 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -45,11 +45,6 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
stat->ctime = inode->i_ctime;
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
-
- if (IS_NOATIME(inode))
- stat->result_mask &= ~STATX_ATIME;
- if (IS_AUTOMOUNT(inode))
- stat->attributes |= STATX_ATTR_AUTOMOUNT;
}
EXPORT_SYMBOL(generic_fillattr);
@@ -75,6 +70,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
stat->result_mask |= STATX_BASIC_STATS;
request_mask &= STATX_ALL;
query_flags &= KSTAT_QUERY_FLAGS;
+
+ /* allow the fs to override these if it really wants to */
+ if (IS_NOATIME(inode))
+ stat->result_mask &= ~STATX_ATIME;
+ if (IS_AUTOMOUNT(inode))
+ stat->attributes |= STATX_ATTR_AUTOMOUNT;
+
if (inode->i_op->getattr)
return inode->i_op->getattr(path, stat, request_mask,
query_flags);
diff --git a/fs/statfs.c b/fs/statfs.c
index f0216629621d..eea7af6f2f22 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
return retval;
}
+int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+{
+ struct kstatfs st;
+ int error;
+
+ error = statfs_by_dentry(dentry, &st);
+ if (error)
+ return error;
+
+ *fsid = st.f_fsid;
+ return 0;
+}
+EXPORT_SYMBOL(vfs_get_fsid);
+
int vfs_statfs(const struct path *path, struct kstatfs *buf)
{
int error;
diff --git a/fs/super.c b/fs/super.c
index 48e25eba8465..583a0124bc39 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -35,6 +35,7 @@
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"
@@ -476,6 +477,94 @@ void generic_shutdown_super(struct super_block *sb)
EXPORT_SYMBOL(generic_shutdown_super);
/**
+ * sget_fc - Find or create a superblock
+ * @fc: Filesystem context.
+ * @test: Comparison callback
+ * @set: Setup callback
+ *
+ * Find or create a superblock using the parameters stored in the filesystem
+ * context and the two callback functions.
+ *
+ * If an extant superblock is matched, then that will be returned with an
+ * elevated reference count that the caller must transfer or discard.
+ *
+ * If no match is made, a new superblock will be allocated and basic
+ * initialisation will be performed (s_type, s_fs_info and s_id will be set and
+ * the set() callback will be invoked), the superblock will be published and it
+ * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
+ * as yet unset.
+ */
+struct super_block *sget_fc(struct fs_context *fc,
+ int (*test)(struct super_block *, struct fs_context *),
+ int (*set)(struct super_block *, struct fs_context *))
+{
+ struct super_block *s = NULL;
+ struct super_block *old;
+ struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
+ int err;
+
+ if (!(fc->sb_flags & SB_KERNMOUNT) &&
+ fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) {
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the namespace.
+ */
+ if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) {
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ } else {
+ if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ }
+ }
+
+retry:
+ spin_lock(&sb_lock);
+ if (test) {
+ hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
+ if (test(old, fc))
+ goto share_extant_sb;
+ }
+ }
+ if (!s) {
+ spin_unlock(&sb_lock);
+ s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+ if (!s)
+ return ERR_PTR(-ENOMEM);
+ goto retry;
+ }
+
+ s->s_fs_info = fc->s_fs_info;
+ err = set(s, fc);
+ if (err) {
+ s->s_fs_info = NULL;
+ spin_unlock(&sb_lock);
+ destroy_unused_super(s);
+ return ERR_PTR(err);
+ }
+ fc->s_fs_info = NULL;
+ s->s_type = fc->fs_type;
+ strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+ list_add_tail(&s->s_list, &super_blocks);
+ hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
+ spin_unlock(&sb_lock);
+ get_filesystem(s->s_type);
+ register_shrinker_prepared(&s->s_shrink);
+ return s;
+
+share_extant_sb:
+ if (user_ns != old->s_user_ns) {
+ spin_unlock(&sb_lock);
+ destroy_unused_super(s);
+ return ERR_PTR(-EBUSY);
+ }
+ if (!grab_super(old))
+ goto retry;
+ destroy_unused_super(s);
+ return old;
+}
+EXPORT_SYMBOL(sget_fc);
+
+/**
* sget_userns - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
@@ -835,28 +924,35 @@ rescan:
}
/**
- * do_remount_sb - asks filesystem to change mount options.
- * @sb: superblock in question
- * @sb_flags: revised superblock flags
- * @data: the rest of options
- * @force: whether or not to force the change
+ * reconfigure_super - asks filesystem to change superblock parameters
+ * @fc: The superblock and configuration
*
- * Alters the mount options of a mounted file system.
+ * Alters the configuration parameters of a live superblock.
*/
-int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
+int reconfigure_super(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
int retval;
- int remount_ro;
+ bool remount_ro = false;
+ bool force = fc->sb_flags & SB_FORCE;
+ if (fc->sb_flags_mask & ~MS_RMT_MASK)
+ return -EINVAL;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
+ retval = security_sb_remount(sb, fc->security);
+ if (retval)
+ return retval;
+
+ if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
- if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
- return -EACCES;
+ if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
+ return -EACCES;
#endif
- remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ }
if (remount_ro) {
if (!hlist_empty(&sb->s_pins)) {
@@ -867,13 +963,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
return 0;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
- remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ remount_ro = !sb_rdonly(sb);
}
}
shrink_dcache_sb(sb);
- /* If we are remounting RDONLY and current sb is read/write,
- make sure there are no rw files opened */
+ /* If we are reconfiguring to RDONLY and current sb is read/write,
+ * make sure there are no files open for writing.
+ */
if (remount_ro) {
if (force) {
sb->s_readonly_remount = 1;
@@ -885,8 +982,8 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
}
}
- if (sb->s_op->remount_fs) {
- retval = sb->s_op->remount_fs(sb, &sb_flags, data);
+ if (fc->ops->reconfigure) {
+ retval = fc->ops->reconfigure(fc);
if (retval) {
if (!force)
goto cancel_readonly;
@@ -895,7 +992,9 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
sb->s_type->name, retval);
}
}
- sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK);
+
+ WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
+ (fc->sb_flags & fc->sb_flags_mask)));
/* Needs to be ordered wrt mnt_is_readonly() */
smp_wmb();
sb->s_readonly_remount = 0;
@@ -922,10 +1021,15 @@ static void do_emergency_remount_callback(struct super_block *sb)
down_write(&sb->s_umount);
if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
!sb_rdonly(sb)) {
- /*
- * What lock protects sb->s_flags??
- */
- do_remount_sb(sb, SB_RDONLY, NULL, 1);
+ struct fs_context *fc;
+
+ fc = fs_context_for_reconfigure(sb->s_root,
+ SB_RDONLY | SB_FORCE, SB_RDONLY);
+ if (!IS_ERR(fc)) {
+ if (parse_monolithic_mount_data(fc, NULL) == 0)
+ (void)reconfigure_super(fc);
+ put_fs_context(fc);
+ }
}
up_write(&sb->s_umount);
}
@@ -1087,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type,
EXPORT_SYMBOL(mount_ns);
+int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
+{
+ return set_anon_super(sb, NULL);
+}
+EXPORT_SYMBOL(set_anon_super_fc);
+
+static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
+{
+ return sb->s_fs_info == fc->s_fs_info;
+}
+
+static int test_single_super(struct super_block *s, struct fs_context *fc)
+{
+ return 1;
+}
+
+/**
+ * vfs_get_super - Get a superblock with a search key set in s_fs_info.
+ * @fc: The filesystem context holding the parameters
+ * @keying: How to distinguish superblocks
+ * @fill_super: Helper to initialise a new superblock
+ *
+ * Search for a superblock and create a new one if not found. The search
+ * criterion is controlled by @keying. If the search fails, a new superblock
+ * is created and @fill_super() is called to initialise it.
+ *
+ * @keying can take one of a number of values:
+ *
+ * (1) vfs_get_single_super - Only one superblock of this type may exist on the
+ * system. This is typically used for special system filesystems.
+ *
+ * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
+ * distinct keys (where the key is in s_fs_info). Searching for the same
+ * key again will turn up the superblock for that key.
+ *
+ * (3) vfs_get_independent_super - Multiple superblocks may exist and are
+ * unkeyed. Each call will get a new superblock.
+ *
+ * A permissions check is made by sget_fc() unless we're getting a superblock
+ * for a kernel-internal mount or a submount.
+ */
+int vfs_get_super(struct fs_context *fc,
+ enum vfs_get_super_keying keying,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc))
+{
+ int (*test)(struct super_block *, struct fs_context *);
+ struct super_block *sb;
+
+ switch (keying) {
+ case vfs_get_single_super:
+ test = test_single_super;
+ break;
+ case vfs_get_keyed_super:
+ test = test_keyed_super;
+ break;
+ case vfs_get_independent_super:
+ test = NULL;
+ break;
+ default:
+ BUG();
+ }
+
+ sb = sget_fc(fc, test, set_anon_super_fc);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ if (!sb->s_root) {
+ int err = fill_super(sb, fc);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+ }
+
+ BUG_ON(fc->root);
+ fc->root = dget(sb->s_root);
+ return 0;
+}
+EXPORT_SYMBOL(vfs_get_super);
+
#ifdef CONFIG_BLOCK
static int set_bdev_super(struct super_block *s, void *data)
{
@@ -1212,6 +1399,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
+static int reconfigure_single(struct super_block *s,
+ int flags, void *data)
+{
+ struct fs_context *fc;
+ int ret;
+
+ /* The caller really need to be passing fc down into mount_single(),
+ * then a chunk of this can be removed. [Bollocks -- AV]
+ * Better yet, reconfiguration shouldn't happen, but rather the second
+ * mount should be rejected if the parameters are not compatible.
+ */
+ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ ret = parse_monolithic_mount_data(fc, data);
+ if (ret < 0)
+ goto out;
+
+ ret = reconfigure_super(fc);
+out:
+ put_fs_context(fc);
+ return ret;
+}
+
static int compare_single(struct super_block *s, void *p)
{
return 1;
@@ -1229,41 +1441,64 @@ struct dentry *mount_single(struct file_system_type *fs_type,
return ERR_CAST(s);
if (!s->s_root) {
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- s->s_flags |= SB_ACTIVE;
+ if (!error)
+ s->s_flags |= SB_ACTIVE;
} else {
- do_remount_sb(s, flags, data, 0);
+ error = reconfigure_single(s, flags, data);
+ }
+ if (unlikely(error)) {
+ deactivate_locked_super(s);
+ return ERR_PTR(error);
}
return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);
-struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_get_tree - Get the mountable root
+ * @fc: The superblock configuration context.
+ *
+ * The filesystem is invoked to get or create a superblock which can then later
+ * be used for mounting. The filesystem places a pointer to the root to be
+ * used for mounting in @fc->root.
+ */
+int vfs_get_tree(struct fs_context *fc)
{
- struct dentry *root;
struct super_block *sb;
- int error = -ENOMEM;
- void *sec_opts = NULL;
+ int error;
- if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
- error = security_sb_eat_lsm_opts(data, &sec_opts);
- if (error)
- return ERR_PTR(error);
+ if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source) {
+ errorf(fc, "Filesystem requires source device");
+ return -ENOENT;
}
- root = type->mount(type, flags, name, data);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- goto out_free_secdata;
+ if (fc->root)
+ return -EBUSY;
+
+ /* Get the mountable root in fc->root, with a ref on the root and a ref
+ * on the superblock.
+ */
+ error = fc->ops->get_tree(fc);
+ if (error < 0)
+ return error;
+
+ if (!fc->root) {
+ pr_err("Filesystem %s get_tree() didn't set fc->root\n",
+ fc->fs_type->name);
+ /* We don't know what the locking state of the superblock is -
+ * if there is a superblock.
+ */
+ BUG();
}
- sb = root->d_sb;
- BUG_ON(!sb);
+
+ sb = fc->root->d_sb;
WARN_ON(!sb->s_bdi);
+ if (fc->subtype && !sb->s_subtype) {
+ sb->s_subtype = fc->subtype;
+ fc->subtype = NULL;
+ }
+
/*
* Write barrier is for super_cache_count(). We place it before setting
* SB_BORN as the data dependency between the two functions is the
@@ -1273,14 +1508,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
smp_wmb();
sb->s_flags |= SB_BORN;
- error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
- if (error)
- goto out_sb;
-
- if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) {
- error = security_sb_kern_mount(sb);
- if (error)
- goto out_sb;
+ error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
+ if (unlikely(error)) {
+ fc_drop_locked(fc);
+ return error;
}
/*
@@ -1290,18 +1521,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
* violate this rule.
*/
WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
- "negative value (%lld)\n", type->name, sb->s_maxbytes);
+ "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);
- up_write(&sb->s_umount);
- security_free_mnt_opts(&sec_opts);
- return root;
-out_sb:
- dput(root);
- deactivate_locked_super(sb);
-out_free_secdata:
- security_free_mnt_opts(&sec_opts);
- return ERR_PTR(error);
+ return 0;
}
+EXPORT_SYMBOL(vfs_get_tree);
/*
* Setup private BDI for given superblock. It gets automatically cleaned up
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 51398457fe00..130fc6fbcc03 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -17,7 +17,6 @@
#include <linux/seq_file.h>
#include "sysfs.h"
-#include "../kernfs/kernfs-internal.h"
/*
* Determine ktype->sysfs_ops for the given kernfs_node. This function
@@ -497,6 +496,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr)
{
int i;
+
for (i = 0; ptr[i]; i++)
sysfs_remove_file(kobj, ptr[i]);
}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 92682fcc41f6..4cb21b558a85 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,34 +13,69 @@
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
+#include <net/net_namespace.h>
#include "sysfs.h"
static struct kernfs_root *sysfs_root;
struct kernfs_node *sysfs_root_kn;
-static struct dentry *sysfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int sysfs_get_tree(struct fs_context *fc)
{
- struct dentry *root;
- void *ns;
- bool new_sb = false;
+ struct kernfs_fs_context *kfc = fc->fs_private;
+ int ret;
- if (!(flags & SB_KERNMOUNT)) {
+ ret = kernfs_get_tree(fc);
+ if (ret)
+ return ret;
+
+ if (kfc->new_sb_created)
+ fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+ return 0;
+}
+
+static void sysfs_fs_context_free(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ if (kfc->ns_tag)
+ kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag);
+ kernfs_free_fs_context(fc);
+ kfree(kfc);
+}
+
+static const struct fs_context_operations sysfs_fs_context_ops = {
+ .free = sysfs_fs_context_free,
+ .get_tree = sysfs_get_tree,
+};
+
+static int sysfs_init_fs_context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc;
+ struct net *netns;
+
+ if (!(fc->sb_flags & SB_KERNMOUNT)) {
if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
- return ERR_PTR(-EPERM);
+ return -EPERM;
}
- ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
- root = kernfs_mount_ns(fs_type, flags, sysfs_root,
- SYSFS_MAGIC, &new_sb, ns);
- if (!new_sb)
- kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
- else if (!IS_ERR(root))
- root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+ kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL);
+ if (!kfc)
+ return -ENOMEM;
- return root;
+ kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+ kfc->root = sysfs_root;
+ kfc->magic = SYSFS_MAGIC;
+ fc->fs_private = kfc;
+ fc->ops = &sysfs_fs_context_ops;
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(netns->user_ns);
+ fc->global = true;
+ return 0;
}
static void sysfs_kill_sb(struct super_block *sb)
@@ -52,10 +87,10 @@ static void sysfs_kill_sb(struct super_block *sb)
}
static struct file_system_type sysfs_fs_type = {
- .name = "sysfs",
- .mount = sysfs_mount,
- .kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "sysfs",
+ .init_fs_context = sysfs_init_fs_context,
+ .kill_sb = sysfs_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
int __init sysfs_init(void)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 803ca070d42e..6a6fc8aa1de7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -560,7 +560,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags,
const struct old_itimerspec32 __user *, utmr,
struct old_itimerspec32 __user *, otmr)
{
@@ -577,7 +577,7 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
return ret;
}
-COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+SYSCALL_DEFINE2(timerfd_gettime32, int, ufd,
struct old_itimerspec32 __user *, otmr)
{
struct itimerspec64 kotmr;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index bc1e082d921d..9da2f135121b 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -8,6 +8,7 @@ config UBIFS_FS
select CRYPTO_LZO if UBIFS_FS_LZO
select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
select CRYPTO_HASH_INFO
+ select UBIFS_FS_XATTR if FS_ENCRYPTION
depends on MTD_UBI
help
UBIFS is a file system for flash devices which works on top of UBI.
@@ -60,17 +61,6 @@ config UBIFS_FS_XATTR
If unsure, say Y.
-config UBIFS_FS_ENCRYPTION
- bool "UBIFS Encryption"
- depends on UBIFS_FS_XATTR && BLOCK
- select FS_ENCRYPTION
- default n
- help
- Enable encryption of UBIFS files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
config UBIFS_FS_SECURITY
bool "UBIFS Security Labels"
depends on UBIFS_FS_XATTR
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 5f838319c8d5..5c4b845754a7 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -6,6 +6,6 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o
ubifs-y += misc.o
-ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
+ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o
ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index daf9f93e15de..82e4e6a30b04 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -193,7 +193,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return err;
}
case FS_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct ubifs_info *c = inode->i_sb->s_fs_info;
err = ubifs_enable_encryption(c);
@@ -206,7 +206,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
#endif
}
case FS_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
return fscrypt_ioctl_get_policy(file, (void __user *)arg);
#else
return -EOPNOTSUPP;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 3da90c951c23..67fac1e8adfb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -748,7 +748,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
goto out;
}
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
if (c->encrypted) {
ubifs_err(c, "file system contains encrypted files but UBIFS"
" was built without crypto support.");
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fac1133dadd..8dc2818fdd84 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2146,7 +2146,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_UBIFS_FS_XATTR
sb->s_xattr = ubifs_xattr_handlers;
#endif
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ubifs_crypt_operations;
#endif
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 38401adaa00d..1ae12900e01d 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -43,7 +43,6 @@
#include <crypto/hash.h>
#include <crypto/algapi.h>
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#include "ubifs-media.h"
@@ -142,7 +141,7 @@
*/
#define WORST_COMPR_FACTOR 2
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
#else
#define UBIFS_CIPHER_BLOCK_SIZE 0
@@ -2072,7 +2071,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
#include "misc.h"
#include "key.h"
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
static inline int ubifs_encrypt(const struct inode *inode,
struct ubifs_data_node *dn,
unsigned int in_len, unsigned int *out_len,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e3d684ea3203..ffd8038ff728 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1474,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
if (lvd->integritySeqExt.extLength)
udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
ret = 0;
+
+ if (!sbi->s_lvid_bh) {
+ /* We can't generate unique IDs without a valid LVID */
+ if (sb_rdonly(sb)) {
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+ } else {
+ udf_warn(sb, "Damaged or missing LVID, forcing "
+ "readonly mount\n");
+ ret = -EACCES;
+ }
+ }
out_bh:
brelse(bh);
return ret;
@@ -1943,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
return 0;
}
+static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
+{
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+ udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
+ lvid->descTag.descCRC = cpu_to_le16(
+ crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+ le16_to_cpu(lvid->descTag.descCRCLength)));
+ lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+}
+
static void udf_open_lvid(struct super_block *sb)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1961,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
else
UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -1986,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb)
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1998,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -2009,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb)
if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
/*
* We set buffer uptodate unconditionally here to avoid spurious
* warnings from mark_buffer_dirty() when previous EIO has marked
* the buffer as !uptodate
*/
set_buffer_uptodate(bh);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -2048,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb)
if (!(++uniqueID & 0xFFFFFFFF))
uniqueID += 16;
lvhd->uniqueID = cpu_to_le64(uniqueID);
+ udf_updated_lvid(sb);
mutex_unlock(&sbi->s_alloc_mutex);
- mark_buffer_dirty(bh);
return ret;
}
@@ -2320,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait)
mutex_lock(&sbi->s_alloc_mutex);
if (sbi->s_lvid_dirty) {
+ struct buffer_head *bh = sbi->s_lvid_bh;
+ struct logicalVolIntegrityDesc *lvid;
+
+ lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+ udf_finalize_lvid(lvid);
+
/*
* Blockdevice will be synced later so we don't have to submit
* the buffer for IO
*/
- mark_buffer_dirty(sbi->s_lvid_bh);
+ mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
}
mutex_unlock(&sbi->s_alloc_mutex);
diff --git a/fs/utimes.c b/fs/utimes.c
index bdcf2daf39c1..350c9c16ace1 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -224,8 +224,8 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
* of sys_utimes.
*/
#ifdef __ARCH_WANT_SYS_UTIME32
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
- struct old_utimbuf32 __user *, t)
+SYSCALL_DEFINE2(utime32, const char __user *, filename,
+ struct old_utimbuf32 __user *, t)
{
struct timespec64 tv[2];
@@ -240,7 +240,7 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
}
#endif
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
+SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
{
struct timespec64 tv[2];
@@ -276,14 +276,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd,
+SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd,
const char __user *, filename,
struct old_timeval32 __user *, t)
{
return do_compat_futimesat(dfd, filename, t);
}
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t)
+SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t)
{
return do_compat_futimesat(AT_FDCWD, filename, t);
}
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 999ad8d00d43..1ef8acf35e7d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -339,14 +339,14 @@ xfs_ag_init_headers(
{ /* BNO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_cntbt_buf_ops,
.work = &xfs_cntroot_init,
.need_init = true
},
@@ -361,7 +361,7 @@ xfs_ag_init_headers(
{ /* FINO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_inobt_buf_ops,
+ .ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
.need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc36c06..e2ba2a3b63b2 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -281,7 +281,7 @@ xfs_ag_resv_init(
*/
ask = used = 0;
- mp->m_inotbt_nores = true;
+ mp->m_finobt_nores = true;
error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
&used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b715668886a4..bc3367b8b7bb 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -568,9 +568,9 @@ xfs_agfl_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return NULL;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
return __this_address;
- if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
@@ -643,6 +643,7 @@ xfs_agfl_write_verify(
const struct xfs_buf_ops xfs_agfl_buf_ops = {
.name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
.verify_struct = xfs_agfl_verify,
@@ -2587,8 +2588,10 @@ xfs_agf_verify(
return __this_address;
}
- if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2670,6 +2673,7 @@ xfs_agf_write_verify(
const struct xfs_buf_ops xfs_agf_buf_ops = {
.name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
.verify_struct = xfs_agf_verify,
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8a2802..9fe949f6055e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -297,48 +297,34 @@ xfs_allocbt_verify(
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
/*
- * magic number and level verification
- *
- * During growfs operations, we can't verify the exact level or owner as
- * the perag is not fully initialised and hence not attached to the
- * buffer. In this case, check against the maximum tree depth.
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
*
- * Similarly, during log recovery we will have a perag structure
- * attached, but the agf information will not yet have been initialised
- * from the on disk AGF. Again, we can only check against maximum limits
- * in this case.
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
*/
level = be16_to_cpu(block->bb_level);
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
return __this_address;
- break;
- case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
- return __this_address;
- break;
- default:
+ } else if (level >= mp->m_ag_maxlevels)
return __this_address;
- }
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
@@ -377,13 +363,23 @@ xfs_allocbt_write_verify(
}
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
- .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
.verify_struct = xfs_allocbt_verify,
};
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
STATIC int
xfs_bnobt_keys_inorder(
@@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_bnobt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_bnobt_buf_ops,
.diff_two_keys = xfs_bnobt_diff_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
@@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_cntbt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_cntbt_buf_ops,
.diff_two_keys = xfs_cntbt_diff_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 844ed87b1900..2dd9ee2a2e08 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
xfs_da_state_free(state);
return retval;
}
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index bdf52a333f3f..2297d8467666 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2652d00842d6..1f6e3965ff74 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -245,25 +245,14 @@ xfs_attr3_leaf_verify(
struct xfs_attr_leaf_entry *entries;
uint32_t end; /* must be 32bit - see below */
int i;
+ xfs_failaddr_t fa;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
- return __this_address;
- }
/*
* In recovery there is a transient state where count == 0 is valid
* because we may have transitioned an empty shortform attr to a leaf
@@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify(
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
.name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
.verify_struct = xfs_attr3_leaf_verify,
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c6b523..65ff600a8067 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok(
static xfs_failaddr_t
xfs_attr3_rmt_verify(
struct xfs_mount *mp,
+ struct xfs_buf *bp,
void *ptr,
int fsbsize,
xfs_daddr_t bno)
@@ -87,7 +88,7 @@ xfs_attr3_rmt_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify(
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify(
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
.name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
.verify_struct = xfs_attr3_rmt_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 332eefa2700b..48502cb9990f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -577,42 +577,44 @@ __xfs_bmap_add_free(
*/
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
*/
STATIC int /* error */
xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- /* REFERENCED */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
xfs_buf_t *cbp; /* child block's buffer */
int error; /* error return value */
- struct xfs_ifork *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
struct xfs_owner_info oinfo;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* check if we actually need the extent format first: */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
#ifdef DEBUG
XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
xfs_btree_check_lptr(cur, cbno, 1));
@@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents(
ASSERT(ifp->if_broot == NULL);
ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -2029,7 +2031,7 @@ done:
/*
* Convert an unwritten allocation to a real allocation or vice versa.
*/
-STATIC int /* error */
+int /* error */
xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
@@ -3685,17 +3687,6 @@ xfs_trim_extent(
}
}
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
- struct xfs_bmbt_irec *irec,
- struct xfs_inode *ip)
-
-{
- xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
- i_size_read(VFS_I(ip))));
-}
-
/*
* Trim the returned map to the required bounds
*/
@@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten(
return 0;
}
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
/*
* Map file blocks to filesystem blocks, and allocate blocks or convert the
* extent state if necessary. Details behaviour is controlled by the flags
@@ -4247,9 +4276,7 @@ xfs_bmapi_write(
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(tp != NULL ||
- (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
- (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+ ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4282,25 +4309,12 @@ xfs_bmapi_write(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!tp || tp->t_firstblock == NULLFSBLOCK) {
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- else
- bma.minleft = 1;
- } else {
- bma.minleft = 0;
- }
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
goto error0;
}
- n = 0;
- end = bno + len;
- obno = bno;
-
if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
@@ -4309,7 +4323,11 @@ xfs_bmapi_write(
bma.ip = ip;
bma.total = total;
bma.datatype = 0;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ n = 0;
+ end = bno + len;
+ obno = bno;
while (bno < end && n < *nmap) {
bool need_alloc = false, wasdelay = false;
@@ -4323,26 +4341,7 @@ xfs_bmapi_write(
ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
(flags & XFS_BMAPI_COWFORK)));
- if (flags & XFS_BMAPI_DELALLOC) {
- /*
- * For the COW fork we can reasonably get a
- * request for converting an extent that races
- * with other threads already having converted
- * part of it, as there converting COW to
- * regular blocks is not protected using the
- * IOLOCK.
- */
- ASSERT(flags & XFS_BMAPI_COWFORK);
- if (!(flags & XFS_BMAPI_COWFORK)) {
- error = -EIO;
- goto error0;
- }
-
- if (eof || bno >= end)
- break;
- } else {
- need_alloc = true;
- }
+ need_alloc = true;
} else if (isnullstartblock(bma.got.br_startblock)) {
wasdelay = true;
}
@@ -4351,8 +4350,7 @@ xfs_bmapi_write(
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if ((need_alloc || wasdelay) &&
- !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+ if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4420,49 +4418,130 @@ xfs_bmapi_write(
}
*nmap = n;
- /*
- * Transform from btree to extents, give it cur.
- */
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- ASSERT(bma.cur);
- error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
- &tmp_logflags, whichfork);
- bma.logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork));
- error = 0;
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
error0:
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmalloca bma = { NULL };
+ struct xfs_trans *tp;
+ int error;
+
/*
- * Log everything. Do this after conversion, there's no point in
- * logging the extent records if we've converted to btree format.
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
*/
- if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- bma.logflags &= ~xfs_ilog_fext(whichfork);
- else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
+ }
+
/*
- * Log whatever the flags say, even if error. Otherwise we might miss
- * detecting a case where the data is changed, there's an error,
- * and it's not logged so we don't shutdown when we should.
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
*/
- if (bma.logflags)
- xfs_trans_log_inode(tp, ip, bma.logflags);
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ if (whichfork == XFS_COW_FORK)
+ bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
- if (bma.cur) {
- xfs_btree_del_cursor(bma.cur, error);
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK) {
+ error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
+ if (error)
+ goto out_finish;
}
- if (!error)
- xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -4536,13 +4615,7 @@ xfs_bmapi_remap(
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -5406,24 +5479,11 @@ nodelete:
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
- /*
- * transform from btree to extents, give it cur
- */
- else if (xfs_bmap_wants_extents(ip, whichfork)) {
- ASSERT(cur != NULL);
- error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
}
- /*
- * transform from extents to local?
- */
- error = 0;
+
error0:
/*
* Log everything. Do this after conversion, there's no point in
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 09d3ea97cc15..8f597f9abdbe 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC 0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
/* Skip online discard of freed extents */
#define XFS_BMAPI_NODISCARD 0x1000
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
{ XFS_BMAPI_NORMAP, "NORMAP" }
@@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
-void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
@@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+ unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
static inline void
xfs_bmap_add_free(
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2e2a43..aff82ed112c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -416,8 +416,10 @@ xfs_bmbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -425,11 +427,6 @@ xfs_bmbt_verify(
fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_BMAP_MAGIC):
- break;
- default:
- return __this_address;
}
/*
@@ -481,6 +478,8 @@ xfs_bmbt_write_verify(
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
.name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
.verify_struct = xfs_bmbt_verify,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee94b5dd..e2737e2ac2ae 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_zone_free(xfs_da_state_zone, state);
}
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static xfs_failaddr_t
xfs_da3_node_verify(
struct xfs_buf *bp)
@@ -124,27 +152,16 @@ xfs_da3_node_verify(
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
const struct xfs_dir_ops *ops;
+ xfs_failaddr_t fa;
ops = xfs_dir_get_ops(mp, NULL);
ops->node_hdr_from_disk(&ichdr, hdr);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_DA3_NODE_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_DA_NODE_MAGIC)
- return __this_address;
- }
if (ichdr.level == 0)
return __this_address;
if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -257,6 +274,8 @@ xfs_da3_node_verify_struct(
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
.name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
.verify_struct = xfs_da3_node_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3bffc78..ae654e06b2fb 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
}
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152cd1a24..156ce95c9c45 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -703,3 +703,20 @@ xfs_dir2_shrink_inode(
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
return 0;
}
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b813d8..f54244779492 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed5919da72..b7d6d78f4ce2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -53,18 +53,16 @@ xfs_dir3_block_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -112,6 +110,8 @@ xfs_dir3_block_write_verify(
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
.verify_struct = xfs_dir3_block_verify,
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c62ec8f..b7b9ce002cb9 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -252,18 +252,16 @@ xfs_dir3_data_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -339,6 +337,8 @@ xfs_dir3_data_write_verify(
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
.name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
.verify_struct = xfs_dir3_data_verify,
@@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e6f5cf..9a3767818c50 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int(
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_failaddr_t fa;
- ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- uint16_t magic3;
-
- magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
- : XFS_DIR3_LEAFN_MAGIC;
-
- if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
- return __this_address;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
- return __this_address;
- } else {
- if (leaf->hdr.info.magic != cpu_to_be16(magic))
- return __this_address;
- }
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
}
static void
-__read_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
xfs_failaddr_t fa;
@@ -185,23 +166,22 @@ __read_verify(
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
static void
-__write_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -216,60 +196,22 @@ __write_verify(
xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
.name = "xfs_dir3_leaf1",
- .verify_read = xfs_dir3_leaf1_read_verify,
- .verify_write = xfs_dir3_leaf1_write_verify,
- .verify_struct = xfs_dir3_leaf1_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
.name = "xfs_dir3_leafn",
- .verify_read = xfs_dir3_leafn_read_verify,
- .verify_write = xfs_dir3_leafn_write_verify,
- .verify_struct = xfs_dir3_leafn_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
int
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb3434f51c..3b03703c5c3d 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -87,20 +87,18 @@ xfs_dir3_free_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
- return __this_address;
}
/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -151,6 +149,8 @@ xfs_dir3_free_write_verify(
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
.name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
.verify_struct = xfs_dir3_free_verify,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f371dd54..fb5bd9a804f6 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify(
const struct xfs_buf_ops xfs_dquot_buf_ops = {
.name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
.verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a105cbb..79e6c4fb1d8a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
#define XFS_ERRTAG_BUF_LRU_REF 31
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
-#define XFS_ERRTAG_MAX 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_MAX 35
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d32152fc8a6c..fe9898875097 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2508,7 +2508,7 @@ xfs_agi_verify(
/*
* Validate the magic number of the agi block.
*/
- if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
return __this_address;
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return __this_address;
@@ -2582,6 +2582,7 @@ xfs_agi_write_verify(
const struct xfs_buf_ops xfs_agi_buf_ops = {
.name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
.verify_struct = xfs_agi_verify,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9b25e7a0df47..1080381ff243 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
return __xfs_inobt_alloc_block(cur, start, new, stat,
XFS_AG_RESV_METADATA);
@@ -154,7 +154,7 @@ xfs_finobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_free_block(cur, bp);
return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
}
@@ -260,6 +260,9 @@ xfs_inobt_verify(
xfs_failaddr_t fa;
unsigned int level;
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
/*
* During growfs operations, we can't verify the exact owner as the
* perag is not fully initialised and hence not attached to the buffer.
@@ -270,18 +273,10 @@ xfs_inobt_verify(
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_IBT_CRC_MAGIC):
- case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_IBT_MAGIC):
- case cpu_to_be32(XFS_FIBT_MAGIC):
- break;
- default:
- return __this_address;
}
/* level verification */
@@ -328,6 +323,16 @@ xfs_inobt_write_verify(
const struct xfs_buf_ops xfs_inobt_buf_ops = {
.name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
.verify_struct = xfs_inobt_verify,
@@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
- .buf_ops = &xfs_inobt_buf_ops,
+ .buf_ops = &xfs_finobt_buf_ops,
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd072015d..bc690f2409fa 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -614,16 +614,15 @@ xfs_iext_realloc_root(
}
/*
- * Increment the sequence counter if we are on a COW fork. This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed. We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
*/
static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
{
- if (state & BMAP_COWFORK)
- WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
}
void
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8cfa4a0..e021d5133ccb 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -97,10 +97,9 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- (unlinked_ino == NULLAGINO ||
- xfs_verify_agino(mp, agno, unlinked_ino));
+ xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -147,12 +146,16 @@ xfs_inode_buf_write_verify(
const struct xfs_buf_ops xfs_inode_buf_ops = {
.name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
- .name = "xxfs_inode_ra",
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2d74a1..00c62ce170d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@ struct xfs_dinode;
*/
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
- unsigned int if_seq; /* cow fork mod counter */
+ unsigned int if_seq; /* fork mod counter */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d9eab657b63e..6f47ab876d90 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -209,7 +209,7 @@ xfs_refcountbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -264,6 +264,7 @@ xfs_refcountbt_write_verify(
const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
.name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
.verify_read = xfs_refcountbt_read_verify,
.verify_write = xfs_refcountbt_write_verify,
.verify_struct = xfs_refcountbt_verify,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf040d745..5738e11055e6 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -310,7 +310,7 @@ xfs_rmapbt_verify(
* from the on disk AGF. Again, we can only check against maximum limits
* in this case.
*/
- if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +365,7 @@ xfs_rmapbt_write_verify(
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
.name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
.verify_read = xfs_rmapbt_read_verify,
.verify_write = xfs_rmapbt_write_verify,
.verify_struct = xfs_rmapbt_verify,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b5a82acd7dfe..77a3a4085de3 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -225,10 +225,11 @@ xfs_validate_sb_common(
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
uint32_t agcount = 0;
uint32_t rem;
- if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp, "bad magic number");
return -EWRONGFS;
}
@@ -781,12 +782,14 @@ out_error:
const struct xfs_buf_ops xfs_sb_buf_ops = {
.name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
@@ -874,7 +877,7 @@ xfs_initialize_perag_data(
uint64_t bfreelst = 0;
uint64_t btree = 0;
uint64_t fdblocks;
- int error;
+ int error = 0;
for (index = 0; index < agcount; index++) {
/*
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe748f0..4e909791aeac 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 77d80106f989..a0ccc253c43d 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -95,7 +95,7 @@ xfs_symlink_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -159,6 +159,7 @@ xfs_symlink_write_verify(
const struct xfs_buf_ops xfs_symlink_buf_ops = {
.name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
.verify_struct = xfs_symlink_verify,
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 3306fc42cfad..de310712dd6d 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -116,6 +116,19 @@ xfs_verify_agino(
}
/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
+/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -204,3 +217,14 @@ xfs_verify_icount(
xfs_icount_range(mp, &min, &max);
return icount >= min && icount <= max;
}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 8f02855a019a..c5a25403b4db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t *first, xfs_agino_t *last);
bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90955ab1e895..ddf06bfaa29d 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -399,7 +399,7 @@ xchk_agf_xref_cntbt(
if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
return;
if (!have) {
- if (agf->agf_freeblks != be32_to_cpu(0))
+ if (agf->agf_freeblks != cpu_to_be32(0))
xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
return;
}
@@ -864,19 +864,17 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (agino == NULLAGINO)
- continue;
- if (!xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 03d1e15cceba..64e31f87d490 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -341,23 +341,19 @@ xrep_agf(
struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTB_CRC_MAGIC,
+ .buf_ops = &xfs_bnobt_buf_ops,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTC_CRC_MAGIC,
+ .buf_ops = &xfs_cntbt_buf_ops,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
- .magic = XFS_RMAP_CRC_MAGIC,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
- .magic = XFS_REFC_CRC_MAGIC,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -875,12 +871,10 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_IBT_CRC_MAGIC,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_FIBT_CRC_MAGIC,
+ .buf_ops = &xfs_finobt_buf_ops,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90547a1..dce74ec57038 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -82,12 +82,23 @@ xchk_xattr_listent(
sx = container_of(context, struct xchk_xattr, context);
+ if (xchk_should_terminate(sx->sc, &error)) {
+ context->seen_enough = 1;
+ return;
+ }
+
if (flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
xchk_ino_set_preen(sx->sc, context->dp->i_ino);
return;
}
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ return;
+ }
+
args.flags = ATTR_KERNOTIME;
if (flags & XFS_ATTR_ROOT)
args.flags |= ATTR_ROOT;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3223e3..a703cd58a90e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -281,6 +281,31 @@ xchk_bmap_extent_xref(
xchk_ag_free(info->sc, &info->sc->sa);
}
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+ return;
+
+ if (!xfs_verify_dablk(mp, irec->br_startoff))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ off = irec->br_startoff + irec->br_blockcount - 1;
+ if (!xfs_verify_dablk(mp, off))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
/* Scrub a single extent record. */
STATIC int
xchk_bmap_extent(
@@ -305,6 +330,8 @@ xchk_bmap_extent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ xchk_bmap_dirattr_extent(ip, info, irec);
+
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d768a18..a38a22785a1a 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -129,6 +129,12 @@ xchk_dir_actor(
goto out;
}
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 882dc56c5c21..700114f79a7d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt(
struct xchk_iallocbt {
/* Number of inodes we see while scanning inobt. */
unsigned long long inodes;
+
+ /* Expected next startino, for big block filesystems. */
+ xfs_agino_t next_startino;
+
+ /* Expected end of the current inode cluster. */
+ xfs_agino_t next_cluster_ino;
};
/*
@@ -128,41 +134,57 @@ xchk_iallocbt_freecount(
return hweight64(freemask);
}
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record. First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
struct xchk_btree *bs,
- xfs_ino_t fsino,
- xfs_agino_t chunkino,
- xfs_agino_t clusterino,
struct xfs_inobt_rec_incore *irec,
- struct xfs_buf *bp)
+ unsigned int irec_ino,
+ struct xfs_dinode *dip)
{
- struct xfs_dinode *dip;
struct xfs_mount *mp = bs->cur->bc_mp;
- bool inode_is_free = false;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ bool irec_free;
+ bool ino_inuse;
bool freemask_ok;
- bool inuse;
int error = 0;
if (xchk_should_terminate(bs->sc, &error))
return error;
- dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ /*
+ * Given an inobt record and the offset of an inode from the start of
+ * the record, compute which fs inode we're talking about.
+ */
+ agino = irec->ir_startino + irec_ino;
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
- (dip->di_version >= 3 &&
- be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
- if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
- inode_is_free = true;
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
- fsino + clusterino, &inuse);
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+ &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
- freemask_ok = inode_is_free ^ !!(dip->di_mode);
+ freemask_ok = irec_free ^ !!(dip->di_mode);
if (!bs->sc->try_harder && !freemask_ok)
return -EDEADLOCK;
} else if (error < 0) {
@@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask(
goto out;
} else {
/* Inode is all there. */
- freemask_ok = inode_is_free ^ inuse;
+ freemask_ok = irec_free ^ ino_inuse;
}
if (!freemask_ok)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -182,86 +204,221 @@ out:
return 0;
}
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk. If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
struct xchk_btree *bs,
- struct xfs_inobt_rec_incore *irec)
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int cluster_base)
{
struct xfs_imap imap;
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_dinode *dip;
- struct xfs_buf *bp;
- xfs_ino_t fsino;
- xfs_agino_t nr_inodes;
- xfs_agino_t agino;
- xfs_agino_t chunkino;
- xfs_agino_t clusterino;
+ struct xfs_buf *cluster_bp;
+ unsigned int nr_inodes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t agbno;
- uint16_t holemask;
+ unsigned int cluster_index;
+ uint16_t cluster_mask = 0;
uint16_t ir_holemask;
int error = 0;
- /* Make sure the freemask matches the inode records. */
- nr_inodes = mp->m_inodes_per_cluster;
-
- for (agino = irec->ir_startino;
- agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
- agino += mp->m_inodes_per_cluster) {
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
- chunkino = agino - irec->ir_startino;
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
- /* Compute the holemask mask for this cluster. */
- for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
- clusterino += XFS_INODES_PER_HOLEMASK_BIT)
- holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
- XFS_INODES_PER_HOLEMASK_BIT);
-
- /* The whole cluster must be a hole or not a hole. */
- ir_holemask = (irec->ir_holemask & holemask);
- if (ir_holemask != holemask && ir_holemask != 0) {
+ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_inodes_per_cluster);
+
+ /* Map this inode cluster */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+ /* Compute a bitmask for this cluster that can be used for holemask. */
+ for (cluster_index = 0;
+ cluster_index < nr_inodes;
+ cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+ cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /*
+ * Map the first inode of this cluster to a buffer and offset.
+ * Be careful about inobt records that don't align with the start of
+ * the inode buffer when block sizes are large enough to hold multiple
+ * inode chunks. When this happens, cluster_base will be zero but
+ * ir_startino can be large enough to make im_boffset nonzero.
+ */
+ ir_holemask = (irec->ir_holemask & cluster_mask);
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+ imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino);
+
+ if (imap.im_boffset != 0 && cluster_base != 0) {
+ ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+ imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+ cluster_mask, ir_holemask,
+ XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+ cluster_base));
+
+ /* The whole cluster must be a hole or not a hole. */
+ if (ir_holemask != cluster_mask && ir_holemask != 0) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xchk_xref_is_not_owned_by(bs->sc, agbno,
+ mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+ return 0;
+ }
+
+ xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+
+ /* Grab the inode cluster buffer. */
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+ 0, 0);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+ return error;
+
+ /* Check free status of each inode within this cluster. */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ struct xfs_dinode *dip;
+
+ if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- continue;
+ break;
}
- /* If any part of this is a hole, skip it. */
- if (ir_holemask) {
- xchk_xref_is_not_owned_by(bs->sc, agbno,
- mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
- continue;
+ dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+ error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+ cluster_base + cluster_index, dip);
+ if (error)
+ break;
+ imap.im_boffset += mp->m_sb.sb_inodesize;
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+ return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ unsigned int cluster_base;
+ int error = 0;
+
+ /*
+ * For the common case where this inobt record maps to multiple inode
+ * clusters this will call _check_cluster for each cluster.
+ *
+ * For the case that multiple inobt records map to a single cluster,
+ * this will call _check_cluster once.
+ */
+ for (cluster_base = 0;
+ cluster_base < XFS_INODES_PER_CHUNK;
+ cluster_base += bs->sc->mp->m_inodes_per_cluster) {
+ error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly. Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk. This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = bs->sc->mp;
+ struct xchk_iallocbt *iabt = bs->private;
+
+ /*
+ * finobt records have different positioning requirements than inobt
+ * records: each finobt record must have a corresponding inobt record.
+ * That is checked in the xref function, so for now we only catch the
+ * obvious case where the record isn't at all aligned properly.
+ *
+ * Note that if a fs block contains more than a single chunk of inodes,
+ * we will have finobt records only for those chunks containing free
+ * inodes, and therefore expect chunk alignment of finobt records.
+ * Otherwise, we expect that the finobt record is aligned to the
+ * cluster alignment as told by the superblock.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ unsigned int imask;
+
+ imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_cluster_align_inodes) - 1;
+ if (irec->ir_startino & imask)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (iabt->next_startino != NULLAGINO) {
+ /*
+ * We're midway through a cluster of inodes that is mapped by
+ * multiple inobt records. Did we get the record for the next
+ * irec in the sequence?
+ */
+ if (irec->ir_startino != iabt->next_startino) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
+ iabt->next_startino += XFS_INODES_PER_CHUNK;
- /* Grab the inode cluster buffer. */
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
- agbno);
- imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
- imap.im_boffset = 0;
-
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
- &dip, &bp, 0, 0);
- if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
- &error))
- continue;
-
- /* Which inodes are free? */
- for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
- error = xchk_iallocbt_check_cluster_freemask(bs,
- fsino, chunkino, clusterino, irec, bp);
- if (error) {
- xfs_trans_brelse(bs->cur->bc_tp, bp);
- return error;
- }
+ /* Are we done with the cluster? */
+ if (iabt->next_startino >= iabt->next_cluster_ino) {
+ iabt->next_startino = NULLAGINO;
+ iabt->next_cluster_ino = NULLAGINO;
}
+ return;
+ }
+
+ /* inobt records must be aligned to cluster and inoalignmnt size. */
+ if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
- xfs_trans_brelse(bs->cur->bc_tp, bp);
+ if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- return error;
+ if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+ return;
+
+ /*
+ * If this is the start of an inode cluster that can be mapped by
+ * multiple inobt records, the next inobt record must follow exactly
+ * after this one.
+ */
+ iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+ iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster;
}
/* Scrub an inobt/finobt record. */
@@ -276,7 +433,6 @@ xchk_iallocbt_rec(
uint64_t holes;
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agino_t agino;
- xfs_agblock_t agbno;
xfs_extlen_t len;
int holecount;
int i;
@@ -303,11 +459,9 @@ xchk_iallocbt_rec(
goto out;
}
- /* Make sure this record is aligned to cluster and inoalignmnt size. */
- agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
- if ((agbno & (mp->m_cluster_align - 1)) ||
- (agbno & (mp->m_blocks_per_cluster - 1)))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_iallocbt_rec_alignment(bs, &irec);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
iabt->inodes += irec.ir_count;
@@ -320,7 +474,7 @@ xchk_iallocbt_rec(
if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
goto out;
- goto check_freemask;
+ goto check_clusters;
}
/* Check each chunk of a sparse inode cluster. */
@@ -346,8 +500,8 @@ xchk_iallocbt_rec(
holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-check_freemask:
- error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+ error = xchk_iallocbt_check_clusters(bs, &irec);
if (error)
goto out;
@@ -429,6 +583,8 @@ xchk_iallocbt(
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
.inodes = 0,
+ .next_startino = NULLAGINO,
+ .next_cluster_ino = NULLAGINO,
};
int error;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 6acf1bfa0bfe..f28f4bad317b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -743,7 +743,8 @@ xrep_findroot_block(
/* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
- if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ ASSERT(fab->buf_ops->magic[1] != 0);
+ if (btblock->bb_magic != fab->buf_ops->magic[1])
goto out;
/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f2fc18bb7605..d990314eb08b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -42,9 +42,6 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
- /* in: magic number of the btree */
- uint32_t magic;
-
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bbb17cc..dbe115b075f7 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space(
startext = fsbno;
endext = fsbno + len - 1;
do_div(startext, sc->mp->m_sb.sb_rextsize);
- if (do_div(endext, sc->mp->m_sb.sb_rextsize))
- endext++;
- extcount = endext - startext;
+ do_div(endext, sc->mp->m_sb.sb_rextsize);
+ extcount = endext - startext + 1;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 8344b14031ef..3c83e8b3b39c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error,
__entry->ret_ip)
);
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, xfs_daddr_t map_daddr,
+ unsigned short map_len, unsigned int chunk_ino,
+ unsigned int nr_inodes, uint16_t cluster_mask,
+ uint16_t holemask, unsigned int cluster_ino),
+ TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ cluster_mask, holemask, cluster_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(xfs_daddr_t, map_daddr)
+ __field(unsigned short, map_len)
+ __field(unsigned int, chunk_ino)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, cluster_ino)
+ __field(uint16_t, cluster_mask)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->map_daddr = map_daddr;
+ __entry->map_len = map_len;
+ __entry->chunk_ino = chunk_ino;
+ __entry->nr_inodes = nr_inodes;
+ __entry->cluster_mask = cluster_mask;
+ __entry->holemask = holemask;
+ __entry->cluster_ino = cluster_ino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->map_daddr,
+ __entry->map_len,
+ __entry->chunk_ino,
+ __entry->nr_inodes,
+ __entry->cluster_mask,
+ __entry->holemask,
+ __entry->cluster_ino)
+)
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d9048bcea49c..3619e9e8d359 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,7 +28,8 @@
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
- unsigned int io_type;
+ int fork;
+ unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
@@ -62,7 +63,7 @@ xfs_find_daxdev_for_inode(
static void
xfs_finish_page_writeback(
struct inode *inode,
- struct bio_vec *bvec,
+ struct bio_vec *bvec,
int error)
{
struct iomap_page *iop = to_iomap_page(bvec->bv_page);
@@ -98,6 +99,7 @@ xfs_destroy_ioend(
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
@@ -109,7 +111,7 @@ xfs_destroy_ioend(
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
xfs_finish_page_writeback(inode, bvec, error);
bio_put(bio);
}
@@ -255,30 +257,20 @@ xfs_end_io(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- break;
- }
-
goto done;
}
/*
- * Success: commit the COW or unwritten blocks if needed.
+ * Success: commit the COW or unwritten blocks if needed.
*/
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
error = xfs_reflink_end_cow(ip, offset, size);
- break;
- case XFS_IO_UNWRITTEN:
- /* writeback should never update isize */
+ else if (ioend->io_state == XFS_EXT_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- break;
- default:
+ else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
- break;
- }
done:
if (ioend->io_append_trans)
@@ -293,7 +285,8 @@ xfs_end_bio(
struct xfs_ioend *ioend = bio->bi_private;
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+ if (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state == XFS_EXT_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
else if (ioend->io_append_trans)
queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -301,6 +294,75 @@ xfs_end_bio(
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ if (offset_fsb < wpc->imap.br_startoff ||
+ offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->fork == XFS_COW_FORK)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
+
+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs
+ * offset_fsb and put the result into wpc->imap. Allocate in a loop
+ * because it may take several attempts to allocate real blocks for a
+ * contiguous delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+ &wpc->imap, wpc->fork == XFS_COW_FORK ?
+ &wpc->cow_seq : &wpc->data_seq);
+ if (error)
+ return error;
+ } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+ return 0;
+}
+
STATIC int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
@@ -310,26 +372,16 @@ xfs_map_blocks(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
struct xfs_bmbt_irec imap;
- int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor icur;
- bool imap_valid;
+ int retries = 0;
int error = 0;
- /*
- * We have to make sure the cached mapping is within EOF to protect
- * against eofblocks trimming on file release leaving us with a stale
- * mapping. Otherwise, a page for a subsequent file extending buffered
- * write could get picked up by this writeback cycle and written to the
- * wrong blocks.
- *
- * Note that what we really want here is a generic mapping invalidation
- * mechanism to protect us from arbitrary extent modifying contexts, not
- * just eofblocks.
- */
- xfs_trim_extent_eof(&wpc->imap, ip);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
/*
* COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +398,19 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- imap_valid = offset_fsb >= wpc->imap.br_startoff &&
- offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
- if (imap_valid &&
- (!xfs_inode_has_cow_data(ip) ||
- wpc->io_type == XFS_IO_COW ||
- wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+ if (xfs_imap_valid(wpc, ip, offset_fsb))
return 0;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* If we don't have a valid map, now it's time to get a new one for this
* offset. This will convert delayed allocations (including COW ones)
* into real extents. If we return without a valid map, it means we
* landed in a hole and we skip the block.
*/
+retry:
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_super->s_maxbytes);
-
- if (offset > mp->m_super->s_maxbytes - count)
- count = mp->m_super->s_maxbytes - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
/*
* Check if this is offset is covered by a COW extents, and if yes use
@@ -382,30 +422,16 @@ xfs_map_blocks(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- /*
- * Truncate can race with writeback since writeback doesn't
- * take the iolock and truncate decreases the file size before
- * it starts truncating the pages between new_size and old_size.
- * Therefore, we can end up in the situation where writeback
- * gets a CoW fork mapping but the truncate makes the mapping
- * invalid and we end up in here trying to get a new mapping.
- * bail out here so that we simply never get a valid mapping
- * and so we drop the write altogether. The page truncation
- * will kill the contents anyway.
- */
- if (offset > i_size_read(inode)) {
- wpc->io_type = XFS_IO_HOLE;
- return 0;
- }
- whichfork = XFS_COW_FORK;
- wpc->io_type = XFS_IO_COW;
+
+ wpc->fork = XFS_COW_FORK;
goto allocate_blocks;
}
/*
- * Map valid and no COW extent in the way? We're done.
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (imap_valid) {
+ if (xfs_imap_valid(wpc, ip, offset_fsb)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -417,51 +443,65 @@ xfs_map_blocks(
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ wpc->fork = XFS_DATA_FORK;
+
+ /* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
- /* landed in a hole or beyond EOF */
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
- wpc->io_type = XFS_IO_HOLE;
- } else {
- /*
- * Truncate to the next COW extent if there is one. This is the
- * only opportunity to do this because we can skip COW fork
- * lookups for the subsequent blocks in the mapping; however,
- * the requirement to treat the COW range separately remains.
- */
- if (cow_fsb != NULLFILEOFF &&
- cow_fsb < imap.br_startoff + imap.br_blockcount)
- imap.br_blockcount = cow_fsb - imap.br_startoff;
-
- if (isnullstartblock(imap.br_startblock)) {
- /* got a delalloc extent */
- wpc->io_type = XFS_IO_DELALLOC;
- goto allocate_blocks;
- }
-
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- wpc->io_type = XFS_IO_UNWRITTEN;
- else
- wpc->io_type = XFS_IO_OVERWRITE;
+ imap.br_state = XFS_EXT_NORM;
}
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+ trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
return 0;
allocate_blocks:
- error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
- &wpc->cow_seq);
- if (error)
+ error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
return error;
- ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
- imap.br_startoff + imap.br_blockcount <= cow_fsb);
- wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+ }
+
+ /*
+ * Due to merging the return real extent might be larger than the
+ * original delalloc one. Trim the return extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+ cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+ ASSERT(wpc->imap.br_startoff <= offset_fsb);
+ ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+ trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
return 0;
}
@@ -486,7 +526,7 @@ xfs_submit_ioend(
int status)
{
/* Convert CoW extents to regular */
- if (!status && ioend->io_type == XFS_IO_COW) {
+ if (!status && ioend->io_fork == XFS_COW_FORK) {
/*
* Yuk. This can do memory allocation, but is not a
* transactional operation so everything is done in GFP_KERNEL
@@ -504,7 +544,8 @@ xfs_submit_ioend(
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN &&
+ (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state != XFS_EXT_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
@@ -533,7 +574,8 @@ xfs_submit_ioend(
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
- unsigned int type,
+ int fork,
+ xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
sector_t sector)
@@ -547,7 +589,8 @@ xfs_alloc_ioend(
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
+ ioend->io_fork = fork;
+ ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
@@ -608,21 +651,23 @@ xfs_add_to_ioend(
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
- if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ if (!wpc->ioend ||
+ wpc->fork != wpc->ioend->io_fork ||
+ wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+ wpc->imap.br_state, offset, bdev, sector);
}
- if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+ if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
- __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
@@ -723,7 +768,7 @@ xfs_writepage_map(
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
- if (wpc->io_type == XFS_IO_HOLE)
+ if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
@@ -918,9 +963,7 @@ xfs_vm_writepage(
struct page *page,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
@@ -934,9 +977,7 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -983,7 +1024,7 @@ xfs_vm_bmap(
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
- if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e5c23948a8ab..6c2615b83c5d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,32 +9,12 @@
extern struct bio_set xfs_ioend_bioset;
/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- *
- * This enum is used in string mapping in xfs_trace.h; please keep the
- * TRACE_DEFINE_ENUMs for it up to date.
- */
-enum {
- XFS_IO_HOLE, /* covers region without any block allocation */
- XFS_IO_DELALLOC, /* covers delalloc region */
- XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
- XFS_IO_OVERWRITE, /* covers already allocated extent */
- XFS_IO_COW, /* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
- { XFS_IO_HOLE, "hole" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }
-
-/*
* Structure for buffered I/O completions.
*/
struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
- unsigned int io_type; /* delalloc / unwritten */
+ int io_fork; /* inode fork written back */
+ xfs_exntst_t io_state; /* extent state */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a58034049995..3d213a7394c5 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -555,6 +555,7 @@ xfs_attr_put_listent(
attrlist_ent_t *aep;
int arraytop;
+ ASSERT(!context->seen_enough);
ASSERT(!(context->flags & ATTR_KERNOVAL));
ASSERT(context->count >= 0);
ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1ee8c5539fa4..2db43ff4f8b5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
* by virtue of the hole punch.
*/
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out;
+ if (error || xfs_is_always_cow_inode(ip))
+ return error;
- error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ return xfs_alloc_file_space(ip, round_down(offset, blksize),
round_up(offset + len, blksize) -
round_down(offset, blksize),
XFS_BMAPI_PREALLOC);
-out:
- return error;
-
}
static int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4f5f2ff3f70f..548344e25128 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -776,29 +776,24 @@ _xfs_buf_read(
}
/*
- * Set buffer ops on an unchecked buffer and validate it, if possible.
+ * Reverify a buffer found in cache without an attached ->b_ops.
*
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents. If the contents
- * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
*
- * Under normal operations, every in-core buffer must have buffer ops assigned
- * to them when the buffer is read in from disk so that we can validate the
- * metadata.
- *
- * However, there are two scenarios where one can encounter in-core buffers
- * that don't have buffer ops. The first is during log recovery of buffers on
- * a V4 filesystem, though these buffers are purged at the end of recovery.
- *
- * The other is online repair, which tries to match arbitrary metadata blocks
- * with btree types in order to find the root. If online repair doesn't match
- * the buffer with /any/ btree type, the buffer remains in memory in DONE state
- * with no ops, and a subsequent read_buf call from elsewhere will not set the
- * ops. This function helps us fix this situation.
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
*/
int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
struct xfs_buf *bp,
const struct xfs_buf_ops *ops)
{
@@ -840,7 +835,7 @@ xfs_buf_read_map(
return bp;
}
- xfs_buf_ensure_ops(bp, ops);
+ xfs_buf_reverify(bp, ops);
if (flags & XBF_ASYNC) {
/*
@@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
atomic_set(&bp->b_lru_ref, lru_ref);
}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+ struct xfs_buf *bp,
+ __be32 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+ struct xfs_buf *bp,
+ __be16 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic16[idx];
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b9f5511ea998..d0b96e071cec 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -125,6 +125,10 @@ struct xfs_buf_map {
struct xfs_buf_ops {
char *name;
+ union {
+ __be32 magic[2]; /* v4 and v5 on disk magic values */
+ __be16 magic16[2]; /* v4 and v5 on disk magic values */
+ };
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f542e77b..a1e177f66404 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
+ XFS_RANDOM_IUNLINK_FALLBACK,
};
struct xfs_errortag_attr {
@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
+ XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
NULL,
};
@@ -357,7 +360,8 @@ xfs_buf_verifier_error(
fa = failaddr ? failaddr : __return_address;
__xfs_buf_ioerror(bp, error, fa);
- xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
fa, bp->b_ops->name, bp->b_bn, name);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..602aa7d62b66 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
+#define XFS_PTAG_VERIFIER_ERROR 0x00000100
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e47425071e65..1f2e2845eb76 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
return -EREMCHG;
}
@@ -872,14 +872,27 @@ xfs_file_fallocate(
goto out_unlock;
}
- if (mode & FALLOC_FL_ZERO_RANGE)
+ if (mode & FALLOC_FL_ZERO_RANGE) {
error = xfs_zero_file_space(ip, offset, len);
- else {
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
}
+ } else {
+ /*
+ * If always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
@@ -1068,10 +1081,10 @@ xfs_file_llseek(
default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
break;
case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
break;
}
@@ -1203,6 +1216,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f3ef70c542e1..584648582ba7 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks(
int error = 0;
int err2;
+ mp->m_finobt_nores = false;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
pag = xfs_perag_get(mp, agno);
err2 = xfs_ag_resv_init(pag, NULL);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84ae382..d0d377384120 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -16,7 +16,7 @@ xfs_param_t xfs_params = {
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 255 },
+ .panic_mask = { 0, 0, 256 },
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ae667ba74a1c..f643a9295179 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- if (unlikely(mp->m_inotbt_nores)) {
+ if (unlikely(mp->m_finobt_nores)) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
&tp);
@@ -1907,86 +1907,510 @@ xfs_inactive(
}
/*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk. The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+ struct rhash_head iu_rhash_head;
+ xfs_agino_t iu_agino; /* X */
+ xfs_agino_t iu_next_unlinked; /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const xfs_agino_t *key = arg->key;
+ const struct xfs_iunlink *iu = obj;
+
+ if (iu->iu_next_unlinked != *key)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
+ .key_len = sizeof(xfs_agino_t),
+ .key_offset = offsetof(struct xfs_iunlink,
+ iu_next_unlinked),
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ struct xfs_iunlink *iu;
+
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+ struct xfs_perag *pag,
+ struct xfs_iunlink *iu)
+{
+ int error;
+
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ /*
+ * Fail loudly if there already was an entry because that's a sign of
+ * corruption of in-memory data. Also fail loudly if we see an error
+ * code we didn't anticipate from the rhashtable code. Currently we
+ * only anticipate ENOMEM.
+ */
+ if (error) {
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+ kmem_free(iu);
+ }
+ /*
+ * Absorb any runtime errors that aren't a result of corruption because
+ * this is a cache and we can always fall back to bucket list scanning.
+ */
+ if (error != 0 && error != -EEXIST)
+ error = 0;
+ return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t this_agino)
+{
+ struct xfs_iunlink *iu;
+
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+ return 0;
+
+ iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+ iu->iu_agino = prev_agino;
+ iu->iu_next_unlinked = this_agino;
+
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agino_t next_unlinked)
+{
+ struct xfs_iunlink *iu;
+ int error;
+
+ /* Look up the old entry; if there wasn't one then exit. */
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ if (!iu)
+ return 0;
+
+ /*
+ * Remove the entry. This shouldn't ever return an error, but if we
+ * couldn't remove the old entry we don't want to add it again to the
+ * hash table, and if the entry disappeared on us then someone's
+ * violated the locking rules and we need to fail loudly. Either way
+ * we cannot remove the inode because internal state is or would have
+ * been corrupt.
+ */
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ if (error)
+ return error;
+
+ /* If there is no new next entry just free our item and return. */
+ if (next_unlinked == NULLAGINO) {
+ kmem_free(iu);
+ return 0;
+ }
+
+ /* Update the entry and re-add it to the hash table. */
+ iu->iu_next_unlinked = next_unlinked;
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+ struct xfs_perag *pag)
+{
+ return rhashtable_init(&pag->pagi_unlinked_hash,
+ &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+ void *ptr,
+ void *arg)
+{
+ struct xfs_iunlink *iu = ptr;
+ bool *freed_anything = arg;
+
+ *freed_anything = true;
+ kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+ struct xfs_perag *pag)
+{
+ bool freed_anything = false;
+
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+ xfs_iunlink_free_item, &freed_anything);
+
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino)
+ return -EFSCORRUPTED;
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_buf *ibp,
+ struct xfs_dinode *dip,
+ struct xfs_imap *imap,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = imap->im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_agnumber_t agno,
+ xfs_agino_t next_agino,
+ xfs_agino_t *old_next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ xfs_agino_t old_value;
+ int error;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ if (error)
+ return error;
+
+ /* Make sure the old pointer isn't garbage. */
+ old_value = be32_to_cpu(dip->di_next_unlinked);
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ *old_next_agino = old_value;
+ if (old_value == next_agino) {
+ if (next_agino != NULLAGINO)
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Ok, update the new pointer. */
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ ibp, dip, &ip->i_imap, next_agino);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
*
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
STATIC int
xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = tp->t_mountp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agino_t agino;
- short bucket_index;
- int offset;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ xfs_agino_t next_agino;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- ASSERT(agino != 0);
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- ASSERT(agi->agi_unlinked[bucket_index]);
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(mp, agno, next_agino))
+ return -EFSCORRUPTED;
+
+ if (next_agino != NULLAGINO) {
+ struct xfs_perag *pag;
+ xfs_agino_t old_agino;
+
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+ &old_agino);
+ if (error)
+ return error;
+ ASSERT(old_agino == NULLAGINO);
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
/*
- * There is already another inode in the bucket we need
- * to add ourselves to. Add us at the front of the list.
- * Here we put the head pointer into our next pointer,
- * and then we fall through to point the head at us.
+ * agino has been unlinked, add a backref from the next inode
+ * back to agino.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_add_backref(pag, agino, next_agino);
+ xfs_perag_put(pag);
if (error)
return error;
+ }
+
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
+ imap->im_blkno = 0;
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
+ __func__, error);
+ return error;
+ }
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino. Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t head_agino,
+ xfs_agino_t target_agino,
+ xfs_agino_t *agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agino_t next_agino;
+ int error;
+
+ ASSERT(head_agino != target_agino);
+ *bpp = NULL;
+
+ /* See if our backref cache can find it faster. */
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+ if (*agino != NULLAGINO) {
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+ if (error)
+ return error;
+
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+ return 0;
+
+ /*
+ * If we get here the cache contents were corrupt, so drop the
+ * buffer and fall back to walking the bucket list.
+ */
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ WARN_ON_ONCE(1);
+ }
+
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+ /* Otherwise, walk the entire bucket until we find it. */
+ next_agino = head_agino;
+ while (next_agino != target_agino) {
+ xfs_agino_t unlinked_agino;
+
+ if (*bpp)
+ xfs_trans_brelse(tp, *bpp);
+
+ *agino = next_agino;
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+ bpp);
+ if (error)
+ return error;
+
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+ /*
+ * Make sure this pointer is valid and isn't an obvious
+ * infinite loop.
+ */
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+ next_agino == unlinked_agino) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW, mp,
+ *dipp, sizeof(**dipp));
+ error = -EFSCORRUPTED;
+ return error;
+ }
+ next_agino = unlinked_agino;
}
- /*
- * Point the bucket head pointer at the inode being inserted.
- */
- ASSERT(agino != 0);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
return 0;
}
@@ -1995,181 +2419,106 @@ xfs_iunlink(
*/
STATIC int
xfs_iunlink_remove(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_ino_t next_ino;
- xfs_mount_t *mp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_agino_t agino;
- xfs_agino_t next_agino;
- xfs_buf_t *last_ibp;
- xfs_dinode_t *last_dip = NULL;
- short bucket_index;
- int offset, last_offset = 0;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ struct xfs_buf *last_ibp;
+ struct xfs_dinode *last_dip = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t next_agino;
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
- mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ trace_xfs_iunlink_remove(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
-
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the head pointer isn't garbage.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- if (!xfs_verify_agino(mp, agno, agino))
- return -EFSCORRUPTED;
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- if (!xfs_verify_agino(mp, agno,
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
}
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
- /*
- * We're at the head of the list. Get the inode's on-disk
- * buffer to see if there is anyone after us on the list.
- * Only modify our next pointer if it is not already NULLAGINO.
- * This saves us the overhead of dealing with the buffer when
- * there is no need to change it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the bucket head pointer at the next inode.
- */
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- } else {
- /*
- * We need to search the list for the inode being freed.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- last_ibp = NULL;
- while (next_agino != agino) {
- struct xfs_imap imap;
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+ if (error)
+ return error;
- if (last_ibp)
- xfs_trans_brelse(tp, last_ibp);
+ /*
+ * If there was a backref pointing from the next inode back to this
+ * one, remove it because we've removed this inode from the list.
+ *
+ * Later, if this inode was in the middle of the list we'll update
+ * this inode's backref to point from the next inode.
+ */
+ if (next_agino != NULLAGINO) {
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_change_backref(pag, next_agino,
+ NULLAGINO);
+ if (error)
+ goto out;
+ }
- imap.im_blkno = 0;
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+ if (head_agino == agino) {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+ if (error)
+ goto out;
+ } else {
+ struct xfs_imap imap;
+ xfs_agino_t prev_agino;
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
+ if (!pag)
+ pag = xfs_perag_get(mp, agno);
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
- &last_ibp, 0, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
+ /* We need to search the list for the inode being freed. */
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+ &prev_agino, &imap, &last_dip, &last_ibp,
+ pag);
+ if (error)
+ goto out;
- last_offset = imap.im_boffset;
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
- if (!xfs_verify_agino(mp, agno, next_agino)) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- last_dip, sizeof(*last_dip));
- return -EFSCORRUPTED;
- }
- }
+ /* Point the previous inode on the list to the next inode. */
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+ last_dip, &imap, next_agino);
/*
- * Now last_ibp points to the buffer previous to us on the
- * unlinked list. Pull us from the list.
+ * Now we deal with the backref for this inode. If this inode
+ * pointed at a real inode, change the backref that pointed to
+ * us to point to our old next. If this inode was the end of
+ * the list, delete the backref that pointed to us. Note that
+ * change_backref takes care of deleting the backref if
+ * next_agino is NULLAGINO.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the previous inode on the list to the next inode.
- */
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
- ASSERT(next_agino != 0);
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, last_dip);
-
- xfs_trans_inode_buf(tp, last_ibp);
- xfs_trans_log_buf(tp, last_ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, last_ibp);
+ error = xfs_iunlink_change_backref(pag, agino, next_agino);
+ if (error)
+ goto out;
}
- return 0;
+
+out:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout(
/*
* Prepare the tmpfile inode as if it were created through the VFS.
- * Otherwise, the link increment paths will complain about nlink 0->1.
- * Drop the link count as done by d_tmpfile(), complete the inode setup
- * and flag it as linkable.
+ * Complete the inode setup and flag it as linkable. nlink is already
+ * zero, so we can skip the drop_nlink.
*/
- drop_nlink(VFS_I(tmpfile));
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..e62074a5257c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone;
bool xfs_inode_verify_forks(struct xfs_inode *ip);
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 27c93b5f029d..63d323916bba 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -35,18 +35,40 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-void
+static int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return -EFSCORRUPTED;
+}
+
+int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool shared)
{
struct xfs_mount *mp = ip->i_mount;
+ if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(imap->br_startblock)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+ if (shared)
+ iomap->flags |= IOMAP_F_SHARED;
+ return 0;
}
static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
return 0;
}
-STATIC int
-xfs_alert_fsblock_zero(
- xfs_inode_t *ip,
- xfs_bmbt_irec_t *imap)
-{
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)imap->br_startblock,
- (unsigned long long)imap->br_startoff,
- (unsigned long long)imap->br_blockcount,
- imap->br_state);
- return -EFSCORRUPTED;
-}
-
int
xfs_iomap_write_direct(
xfs_inode_t *ip,
@@ -383,12 +395,13 @@ xfs_quota_calc_throttle(
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_inode *ip,
+ int whichfork,
loff_t offset,
loff_t count,
struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmbt_irec prev;
int shift = 0;
@@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
xfs_fileoff_t end_fsb;
- int error = 0, eof = 0;
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int whichfork = XFS_DATA_FORK;
+ int error = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
@@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay(
end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+ /*
+ * Search the data fork fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
- got.br_startoff = end_fsb; /* fake hole until the end */
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
- if (got.br_startoff <= offset_fsb) {
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * block adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ whichfork = XFS_COW_FORK;
+ goto done;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
/*
* For reflink files we may need a delalloc reservation when
* overwriting shared extents. This includes zeroing of
* existing extents that contain data.
*/
- if (xfs_is_reflink_inode(ip) &&
- ((flags & IOMAP_WRITE) ||
- got.br_state != XFS_EXT_UNWRITTEN)) {
- xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got);
- if (error)
- goto out_unlock;
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
}
- trace_xfs_iomap_found(ip, offset, count, 0, &got);
- goto done;
- }
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
- if (flags & IOMAP_ZERO) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
- goto out_unlock;
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_inode_need_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ whichfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done where somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the values needs to be less than 32-bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (xfs_is_always_cow_inode(ip))
+ whichfork = XFS_COW_FORK;
}
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
- * to keep the chunks of work done where somewhat symmetric with the
- * work writeback does. This is a completely arbitrary number pulled
- * out of thin air as a best guess for initial testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
- &icur);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+ count, &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay(
}
retry:
- error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
- eof);
+ error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap,
+ whichfork == XFS_DATA_FORK ? &icur : &ccur,
+ whichfork == XFS_DATA_FORK ? eof : cow_eof);
switch (error) {
case 0:
break;
@@ -647,186 +711,22 @@ retry:
* them out if the write happens to fail.
*/
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap);
done:
- if (isnullstartblock(got.br_startblock))
- got.br_startblock = DELAYSTARTBLOCK;
-
- if (!got.br_startblock) {
- error = xfs_alert_fsblock_zero(ip, &got);
- if (error)
+ if (whichfork == XFS_COW_FORK) {
+ if (imap.br_startoff > offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
goto out_unlock;
- }
-
- xfs_bmbt_to_iomap(ip, iomap, &got);
-
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
- xfs_inode_t *ip,
- int whichfork,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- unsigned int *cow_seq)
-{
- xfs_mount_t *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_fileoff_t offset_fsb, last_block;
- xfs_fileoff_t end_fsb, map_start_fsb;
- xfs_filblks_t count_fsb;
- xfs_trans_t *tp;
- int nimaps;
- int error = 0;
- int flags = XFS_BMAPI_DELALLOC;
- int nres;
-
- if (whichfork == XFS_COW_FORK)
- flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
- /*
- * Make sure that the dquots are there.
- */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = imap->br_blockcount;
- map_start_fsb = imap->br_startoff;
-
- XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
- while (count_fsb != 0) {
- /*
- * Set up a transaction with which to allocate the
- * backing store for the file. Do allocations in a
- * loop until we get some space in the range we are
- * interested in. The other space that might be allocated
- * is in the delayed allocation extent on which we sit
- * but before our buffer starts.
- */
- nimaps = 0;
- while (nimaps == 0) {
- nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- /*
- * We have already reserved space for the extent and any
- * indirect blocks when creating the delalloc extent,
- * there is no need to reserve space in this transaction
- * again.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * it is possible that the extents have changed since
- * we did the read call as we dropped the ilock for a
- * while. We have to be careful about truncates or hole
- * punchs here - we are not allowed to allocate
- * non-delalloc blocks here.
- *
- * The only protection against truncation is the pages
- * for the range we are being asked to convert are
- * locked and hence a truncate will block on them
- * first.
- *
- * As a result, if we go beyond the range we really
- * need and hit an delalloc extent boundary followed by
- * a hole while we have excess blocks in the map, we
- * will fill the hole incorrectly and overrun the
- * transaction reservation.
- *
- * Using a single map prevents this as we are forced to
- * check each map we look for overlap with the desired
- * range and abort as soon as we find it. Also, given
- * that we only return a single map, having one beyond
- * what we can return is probably a bit silly.
- *
- * We also need to check that we don't go beyond EOF;
- * this is a truncate optimisation as a truncate sets
- * the new file size before block on the pages we
- * currently have locked under writeback. Because they
- * are about to be tossed, we don't need to write them
- * back....
- */
- nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(ip, &last_block,
- XFS_DATA_FORK);
- if (error)
- goto trans_cancel;
-
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
- if ((map_start_fsb + count_fsb) > last_block) {
- count_fsb = last_block - map_start_fsb;
- if (count_fsb == 0) {
- error = -EAGAIN;
- goto trans_cancel;
- }
- }
-
- /*
- * From this point onwards we overwrite the imap
- * pointer that the caller gave to us.
- */
- error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, flags, nres, imap,
- &nimaps);
- if (error)
- goto trans_cancel;
-
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
-
- if (whichfork == XFS_COW_FORK)
- *cow_seq = READ_ONCE(ifp->if_seq);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- /*
- * See if we were able to allocate an extent that
- * covers at least part of the callers request
- */
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, imap);
-
- if ((offset_fsb >= imap->br_startoff) &&
- (offset_fsb < (imap->br_startoff +
- imap->br_blockcount))) {
- XFS_STATS_INC(mp, xs_xstrat_quick);
- return 0;
}
-
- /*
- * So far we have not mapped the requested part of the
- * file, just surrounding data, try again.
- */
- count_fsb -= imap->br_blockcount;
- map_start_fsb = imap->br_startoff + imap->br_blockcount;
+ /* ensure we only report blocks we have a reservation for */
+ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+ shared = true;
}
-
-trans_cancel:
- xfs_trans_cancel(tp);
-error0:
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -975,7 +875,7 @@ xfs_ilock_for_iomap(
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && is_write) {
+ if (xfs_is_cow_inode(ip) && is_write) {
/*
* FIXME: It could still overwrite on unshared extents and not
* need allocation.
@@ -1009,7 +909,7 @@ relock:
* check, so if we got ILOCK_SHARED for a write and but we're now a
* reflink inode we have to switch to ILOCK_EXCL and relock.
*/
- if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
xfs_iunlock(ip, mode);
mode = XFS_ILOCK_EXCL;
goto relock;
@@ -1081,23 +981,33 @@ xfs_file_iomap_begin(
* Break shared extents if necessary. Checks for non-blocking IO have
* been done up front, so we don't need to do them here.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
+ struct xfs_bmbt_irec cmap;
+ bool directio = (flags & IOMAP_DIRECT);
+
/* if zeroing doesn't need COW allocation, then we are done. */
if ((flags & IOMAP_ZERO) &&
!needs_cow_for_zeroing(&imap, nimaps))
goto out_found;
- if (flags & IOMAP_DIRECT) {
- /* may drop and re-acquire the ilock */
- error = xfs_reflink_allocate_cow(ip, &imap, &shared,
- &lockmode);
- if (error)
- goto out_unlock;
- } else {
- error = xfs_reflink_reserve_cow(ip, &imap);
- if (error)
- goto out_unlock;
- }
+ /* may drop and re-acquire the ilock */
+ cmap = imap;
+ error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+ directio);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For buffered writes we need to report the address of the
+ * previous block (if there was any) so that the higher level
+ * write code can perform read-modify-write operations; we
+ * won't need the CoW fork mapping until writeback. For direct
+ * I/O, which must be block aligned, we need to report the
+ * newly allocated address. If the data fork has a hole, copy
+ * the COW fork mapping to avoid allocating to the data fork.
+ */
+ if (directio || imap.br_startblock == HOLESTARTBLOCK)
+ imap = cmap;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1049,15 @@ xfs_file_iomap_begin(
return error;
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
- & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
-
- xfs_bmbt_to_iomap(ip, iomap, &imap);
-
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
- return 0;
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
ASSERT(nimaps);
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
goto out_finish;
out_unlock:
@@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = {
};
static int
+xfs_seek_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+ /*
+ * If we found a data extent we are done.
+ */
+ if (imap.br_startoff <= offset_fsb)
+ goto done;
+ data_fsb = imap.br_startoff;
+ } else {
+ /*
+ * Fake a hole until the end of the file.
+ */
+ data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ }
+
+ /*
+ * If a COW fork extent covers the hole, report it - capped to the next
+ * data fork extent:
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cow_fsb = cmap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ /*
+ * This is a COW extent, so we must probe the page cache
+ * because there could be dirty page cache being backed
+ * by this extent.
+ */
+ iomap->type = IOMAP_UNWRITTEN;
+ goto out_unlock;
+ }
+
+ /*
+ * Else report a hole, capped to the next found data or COW extent.
+ */
+ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+ imap.br_blockcount = cow_fsb - offset_fsb;
+ else
+ imap.br_blockcount = data_fsb - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+done:
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+ .iomap_begin = xfs_seek_iomap_begin,
+};
+
+static int
xfs_xattr_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin(
out_unlock:
xfs_iunlock(ip, lockmode);
- if (!error) {
- ASSERT(nimaps);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- }
-
- return error;
+ if (error)
+ return error;
+ ASSERT(nimaps);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c6170548831b..5c2f6aa6d78f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
- struct xfs_bmbt_irec *, unsigned int *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *, bool shared);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7a8d3e..74047bd0c1ae 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -191,9 +191,18 @@ xfs_generic_create(
xfs_setup_iops(ip);
- if (tmpfile)
+ if (tmpfile) {
+ /*
+ * The VFS requires that any inode fed to d_tmpfile must have
+ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+ * However, we created the temp file with nlink == 0 because
+ * we're not allowed to put an inode with nlink > 0 on the
+ * unlinked list. Therefore we have to set nlink to 1 so that
+ * d_tmpfile can immediately set it back to zero.
+ */
+ set_nlink(inode, 1);
d_tmpfile(dentry, inode);
- else
+ } else
d_instantiate(dentry, inode);
xfs_finish_inode_setup(ip);
@@ -522,6 +531,10 @@ xfs_vn_getattr(
}
}
+ /*
+ * Note: If you add another clause to set an attribute flag, please
+ * update attributes_mask below.
+ */
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +542,10 @@ xfs_vn_getattr(
if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND |
+ STATX_ATTR_NODUMP);
+
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9fe88d125f0a..3371d1ff27c4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_BTREE_BUF:
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_allocbt_buf_ops;
+ bp->b_ops = &xfs_cntbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
- case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
- case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
@@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2(
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+ if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b4d8c318be3c..fd63b0b1307c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@ xfs_initialize_perag(
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+ error = xfs_iunlink_init(pag);
+ if (error)
+ goto out_hash_destroy;
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
+ xfs_iunlink_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7daafe064af8..110f927cf943 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -138,7 +138,7 @@ typedef struct xfs_mount {
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint64_t m_flags; /* global mount flags */
- bool m_inotbt_nores; /* no per-AG finobt resv. */
+ bool m_finobt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_ialloc_min_blks;/* min blocks in sparse inode
@@ -194,6 +194,7 @@ typedef struct xfs_mount {
*/
uint32_t m_generation;
+ bool m_always_cow;
bool m_fail_unmount;
#ifdef DEBUG
/*
@@ -396,6 +397,13 @@ typedef struct xfs_perag {
/* reference count */
uint8_t pagf_refcount_level;
+
+ /*
+ * Unlinked inode information. This incore information reflects
+ * data stored in the AGI, so callers must hold the AGI buffer lock
+ * or have some other means to control concurrency.
+ */
+ struct rhashtable pagi_unlinked_hash;
} xfs_perag_t;
static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d20d8d4..c8ba98fae30a 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+
+ /*
+ * The v5 superblock format extended several v4 header structures with
+ * additional data. While new fields are only accessible on v5
+ * superblocks, it's important that the v5 structures place original v4
+ * fields/headers in the correct location on-disk. For example, we must
+ * be able to find magic values at the same location in certain blocks
+ * regardless of superblock version.
+ *
+ * The following checks ensure that various v5 data structures place the
+ * subset of v4 metadata associated with the same type of block at the
+ * start of the on-disk block. If there is no data structure definition
+ * for certain types of v4 blocks, traverse down to the first field of
+ * common metadata (e.g., magic value) and make sure it is at offset
+ * zero.
+ */
+ XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c3599527d..bde2c9f56a46 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c5b4fa004ca4..680ae7662a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared(
}
}
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got;
- int error = 0;
- bool eof = false;
- struct xfs_iext_cursor icur;
- bool shared;
-
- /*
- * Search the COW fork extent list first. This serves two purposes:
- * first this implement the speculative preallocation using cowextisze,
- * so that we also unshared block adjacent to shared blocks instead
- * of just the shared blocks themselves. Second the lookup in the
- * extent list is generally faster than going out to the shared extent
- * tree.
- */
-
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
- eof = true;
- if (!eof && got.br_startoff <= imap->br_startoff) {
- trace_xfs_reflink_cow_found(ip, imap);
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ /* We can't update any real extents in always COW mode. */
+ if (xfs_is_always_cow_inode(ip) &&
+ !isnullstartblock(imap->br_startblock)) {
+ *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, &shared);
- if (error)
- return error;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!shared)
- return 0;
-
- /*
- * Fork all the shared blocks from our write offset until the end of
- * the extent.
- */
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- return error;
-
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &icur, eof);
- if (error == -ENOSPC || error == -EDQUOT)
- trace_xfs_reflink_cow_enospc(ip, imap);
- if (error)
- return error;
-
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
- trace_xfs_reflink_cow_alloc(ip, &got);
- return 0;
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
- struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- xfs_fileoff_t offset_fsb,
- xfs_filblks_t count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb)
{
- int nimaps = 1;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_btree_cur *dummy_cur = NULL;
+ int dummy_logflags;
+ int error = 0;
- if (imap->br_state == XFS_EXT_NORM)
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
return 0;
- xfs_trim_extent(imap, offset_fsb, count_fsb);
- trace_xfs_reflink_convert_cow(ip, imap);
- if (imap->br_blockcount == 0)
- return 0;
- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
- &nimaps);
+ do {
+ if (got.br_startoff >= offset_fsb + count_fsb)
+ break;
+ if (got.br_state == XFS_EXT_NORM)
+ continue;
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+ return -EIO;
+
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ if (!got.br_blockcount)
+ continue;
+
+ got.br_state = XFS_EXT_NORM;
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
+ &dummy_logflags);
+ if (error)
+ return error;
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+ return error;
}
/* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -334,15 +300,12 @@ xfs_reflink_convert_cow(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
- struct xfs_bmbt_irec imap;
- int nimaps = 1, error = 0;
+ int error;
ASSERT(count != 0);
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -375,7 +338,7 @@ xfs_find_trim_cow_extent(
if (got.br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
got.br_startoff - imap->br_startoff);
- return xfs_reflink_trim_around_shared(ip, imap, shared);
+ return xfs_inode_need_cow(ip, imap, shared);
}
*shared = true;
@@ -397,7 +360,8 @@ xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared,
- uint *lockmode)
+ uint *lockmode,
+ bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
@@ -409,7 +373,10 @@ xfs_reflink_allocate_cow(
xfs_extlen_t resblks = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_is_reflink_inode(ip));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
if (error || !*shared)
@@ -471,7 +438,16 @@ xfs_reflink_allocate_cow(
if (nimaps == 0)
return -ENOSPC;
convert:
- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+ xfs_trim_extent(imap, offset_fsb, count_fsb);
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ return 0;
+ trace_xfs_reflink_convert_cow(ip, imap);
+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range(
int error;
trace_xfs_reflink_cancel_cow_range(ip, offset, count);
- ASSERT(xfs_is_reflink_inode(ip));
+ ASSERT(ip->i_cowfp);
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
if (count == NULLFILEOFF)
@@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks(
break;
ASSERT(nimaps == 1);
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+ trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
&imap);
/* Translate imap into the destination file. */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 6d73daef1f13..28a43b7f581d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow &&
+ xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ bool *shared);
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+ struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+ bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c9097cb0b955..f093ea244849 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1594,6 +1594,13 @@ xfs_mount_alloc(
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
return mp;
}
@@ -1729,11 +1736,18 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
- xfs_alert(mp,
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
"reflink not compatible with realtime device!");
- error = -EINVAL;
- goto out_filestream_unmount;
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_globals.always_cow) {
+ xfs_info(mp, "using DEBUG-only always_cow mode.");
+ mp->m_always_cow = true;
+ }
}
if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 168488130a19..ad7f9be13087 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,7 @@ struct xfs_globals {
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
+ bool always_cow; /* use COW fork for all overwrites */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994a7250..cabda13f3c64 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -183,10 +183,34 @@ mount_delay_show(
}
XFS_SYSFS_ATTR_RW(mount_delay);
+static ssize_t
+always_cow_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.always_cow);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static ssize_t
+always_cow_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
+ ATTR_LIST(always_cow),
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6fcc893dfc91..47fb07d86efd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \
DEFINE_READPAGE_EVENT(xfs_vm_readpage);
DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-TRACE_DEFINE_ENUM(XFS_IO_HOLE);
-TRACE_DEFINE_ENUM(XFS_IO_DELALLOC);
-TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN);
-TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE);
-TRACE_DEFINE_ENUM(XFS_IO_COW);
-
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int type, struct xfs_bmbt_irec *irec),
- TP_ARGS(ip, offset, count, type, irec),
+ int whichfork, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, whichfork, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, type)
+ __field(int, whichfork)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->type = type;
+ __entry->whichfork = whichfork;
__entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __print_symbolic(__entry->type, XFS_IO_TYPES),
+ __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
)
-#define DEFINE_IOMAP_EVENT(name) \
+#define DEFINE_IMAP_EVENT(name) \
DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int type, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+ int whichfork, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
TRACE_EVENT(xfs_reflink_remap_blocks_loop,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
@@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
DEFINE_TRANS_EVENT(xfs_trans_add_item);
DEFINE_TRANS_EVENT(xfs_trans_free_items);
+TRACE_EVENT(xfs_iunlink_update_bucket,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bucket = bucket;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno %u agino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 11cff449d055..e1c7d55b32c3 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -17,7 +17,6 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_inode.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate a "bmap update done"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 629f1479c9d2..7d65ebf1e847 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -277,7 +277,7 @@ xfs_trans_read_buf_map(
* release this buffer when it kills the tranaction.
*/
ASSERT(bp->b_ops != NULL);
- error = xfs_buf_ensure_ops(bp, ops);
+ error = xfs_buf_reverify(bp, ops);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 0710434eb240..8ee7a3f8bb20 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,7 +18,6 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_trace.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate an "extent free done"
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 6c947ff4faf6..8d734728dd1b 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -16,7 +16,6 @@
#include "xfs_refcount_item.h"
#include "xfs_alloc.h"
#include "xfs_refcount.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate a "refcount update done"
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index a42890931ecd..5c7936b1be13 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -16,7 +16,6 @@
#include "xfs_rmap_item.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
-#include "xfs_defer.h"
/* Set the map extent flags for this reverse mapping. */
static void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5bf1d7..9a63016009a1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
char *offset;
int arraytop;
+ if (context->count < 0 || context->seen_enough)
+ return;
+
if (!context->alist)
goto compute_size;