aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/dir.c23
-rw-r--r--fs/aio.c9
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/binfmt_elf.c3
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/backref.c239
-rw-r--r--fs/btrfs/backref.h21
-rw-r--r--fs/btrfs/block-group.c180
-rw-r--r--fs/btrfs/block-group.h39
-rw-r--r--fs/btrfs/block-rsv.c3
-rw-r--r--fs/btrfs/block-rsv.h9
-rw-r--r--fs/btrfs/btrfs_inode.h25
-rw-r--r--fs/btrfs/compression.c99
-rw-r--r--fs/btrfs/ctree.c43
-rw-r--r--fs/btrfs/ctree.h370
-rw-r--r--fs/btrfs/delalloc-space.c13
-rw-r--r--fs/btrfs/delalloc-space.h3
-rw-r--r--fs/btrfs/delayed-inode.c292
-rw-r--r--fs/btrfs/delayed-inode.h34
-rw-r--r--fs/btrfs/dev-replace.c16
-rw-r--r--fs/btrfs/dev-replace.h4
-rw-r--r--fs/btrfs/disk-io.c303
-rw-r--r--fs/btrfs/disk-io.h7
-rw-r--r--fs/btrfs/extent-io-tree.c1674
-rw-r--r--fs/btrfs/extent-io-tree.h126
-rw-r--r--fs/btrfs/extent-tree.c33
-rw-r--r--fs/btrfs/extent_io.c2908
-rw-r--r--fs/btrfs/extent_io.h17
-rw-r--r--fs/btrfs/extent_map.c347
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file-item.c38
-rw-r--r--fs/btrfs/file.c805
-rw-r--r--fs/btrfs/free-space-cache.c115
-rw-r--r--fs/btrfs/free-space-cache.h1
-rw-r--r--fs/btrfs/free-space-tree.c8
-rw-r--r--fs/btrfs/inode.c524
-rw-r--r--fs/btrfs/ioctl.c24
-rw-r--r--fs/btrfs/locking.c25
-rw-r--r--fs/btrfs/locking.h1
-rw-r--r--fs/btrfs/misc.h35
-rw-r--r--fs/btrfs/ordered-data.c50
-rw-r--r--fs/btrfs/ordered-data.h13
-rw-r--r--fs/btrfs/props.c5
-rw-r--r--fs/btrfs/qgroup.c96
-rw-r--r--fs/btrfs/qgroup.h3
-rw-r--r--fs/btrfs/raid56.c45
-rw-r--r--fs/btrfs/raid56.h4
-rw-r--r--fs/btrfs/reflink.c10
-rw-r--r--fs/btrfs/relocation.c40
-rw-r--r--fs/btrfs/root-tree.c16
-rw-r--r--fs/btrfs/scrub.c668
-rw-r--r--fs/btrfs/send.c464
-rw-r--r--fs/btrfs/send.h20
-rw-r--r--fs/btrfs/space-info.c96
-rw-r--r--fs/btrfs/space-info.h9
-rw-r--r--fs/btrfs/subpage.c2
-rw-r--r--fs/btrfs/super.c112
-rw-r--r--fs/btrfs/sysfs.c172
-rw-r--r--fs/btrfs/tests/btrfs-tests.c2
-rw-r--r--fs/btrfs/tests/extent-io-tests.c39
-rw-r--r--fs/btrfs/tests/free-space-tests.c22
-rw-r--r--fs/btrfs/tests/inode-tests.c10
-rw-r--r--fs/btrfs/transaction.c162
-rw-r--r--fs/btrfs/tree-log.c1543
-rw-r--r--fs/btrfs/tree-log.h8
-rw-r--r--fs/btrfs/verity.c3
-rw-r--r--fs/btrfs/volumes.c353
-rw-r--r--fs/btrfs/volumes.h50
-rw-r--r--fs/btrfs/zoned.c142
-rw-r--r--fs/buffer.c195
-rw-r--r--fs/cachefiles/namei.c122
-rw-r--r--fs/ceph/caps.c14
-rw-r--r--fs/ceph/export.c3
-rw-r--r--fs/ceph/inode.c31
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/mdsmap.c2
-rw-r--r--fs/cifs/cached_dir.c490
-rw-r--r--fs/cifs/cached_dir.h30
-rw-r--r--fs/cifs/cifs_debug.c4
-rw-r--r--fs/cifs/cifs_debug.h6
-rw-r--r--fs/cifs/cifs_ioctl.h8
-rw-r--r--fs/cifs/cifs_swn.c12
-rw-r--r--fs/cifs/cifsencrypt.c100
-rw-r--r--fs/cifs/cifsfs.c16
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h78
-rw-r--r--fs/cifs/cifspdu.h7
-rw-r--r--fs/cifs/cifsproto.h20
-rw-r--r--fs/cifs/cifssmb.c4
-rw-r--r--fs/cifs/connect.c43
-rw-r--r--fs/cifs/dfs_cache.c2
-rw-r--r--fs/cifs/dir.c46
-rw-r--r--fs/cifs/file.c65
-rw-r--r--fs/cifs/fs_context.c12
-rw-r--r--fs/cifs/fscache.c2
-rw-r--r--fs/cifs/inode.c184
-rw-r--r--fs/cifs/ioctl.c25
-rw-r--r--fs/cifs/link.c120
-rw-r--r--fs/cifs/misc.c63
-rw-r--r--fs/cifs/readdir.c31
-rw-r--r--fs/cifs/sess.c49
-rw-r--r--fs/cifs/smb1ops.c56
-rw-r--r--fs/cifs/smb2file.c127
-rw-r--r--fs/cifs/smb2inode.c174
-rw-r--r--fs/cifs/smb2misc.c15
-rw-r--r--fs/cifs/smb2ops.c188
-rw-r--r--fs/cifs/smb2pdu.c119
-rw-r--r--fs/cifs/smb2pdu.h3
-rw-r--r--fs/cifs/smb2proto.h25
-rw-r--r--fs/cifs/smb2transport.c98
-rw-r--r--fs/cifs/smbdirect.c227
-rw-r--r--fs/cifs/smbdirect.h14
-rw-r--r--fs/cifs/trace.h3
-rw-r--r--fs/cifs/transport.c5
-rw-r--r--fs/coredump.c43
-rw-r--r--fs/crypto/keyring.c17
-rw-r--r--fs/d_path.c5
-rw-r--r--fs/dcache.c17
-rw-r--r--fs/debugfs/file.c16
-rw-r--r--fs/debugfs/inode.c37
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/file.c40
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/efivarfs/vars.c16
-rw-r--r--fs/erofs/fscache.c3
-rw-r--r--fs/erofs/zdata.c30
-rw-r--r--fs/erofs/zdata.h6
-rw-r--r--fs/erofs/zmap.c22
-rw-r--r--fs/eventfd.c10
-rw-r--r--fs/eventpoll.c2
-rw-r--r--fs/exec.c23
-rw-r--r--fs/exfat/dir.c6
-rw-r--r--fs/exfat/inode.c2
-rw-r--r--fs/exportfs/expfs.c7
-rw-r--r--fs/ext2/balloc.c7
-rw-r--r--fs/ext2/ialloc.c3
-rw-r--r--fs/ext2/namei.c6
-rw-r--r--fs/ext2/super.c22
-rw-r--r--fs/ext4/ext4.h5
-rw-r--r--fs/ext4/extents.c107
-rw-r--r--fs/ext4/extents_status.c3
-rw-r--r--fs/ext4/fast_commit.c210
-rw-r--r--fs/ext4/fast_commit.h3
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/ialloc.c7
-rw-r--r--fs/ext4/inode.c17
-rw-r--r--fs/ext4/ioctl.c11
-rw-r--r--fs/ext4/migrate.c3
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/move_extent.c26
-rw-r--r--fs/ext4/namei.c23
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c1240
-rw-r--r--fs/ext4/verity.c9
-rw-r--r--fs/ext4/xattr.c1
-rw-r--r--fs/f2fs/acl.c2
-rw-r--r--fs/f2fs/checkpoint.c65
-rw-r--r--fs/f2fs/compress.c32
-rw-r--r--fs/f2fs/data.c53
-rw-r--r--fs/f2fs/debug.c9
-rw-r--r--fs/f2fs/dir.c1
-rw-r--r--fs/f2fs/extent_cache.c9
-rw-r--r--fs/f2fs/f2fs.h48
-rw-r--r--fs/f2fs/file.c57
-rw-r--r--fs/f2fs/gc.c42
-rw-r--r--fs/f2fs/inline.c17
-rw-r--r--fs/f2fs/inode.c51
-rw-r--r--fs/f2fs/iostat.c74
-rw-r--r--fs/f2fs/iostat.h4
-rw-r--r--fs/f2fs/namei.c15
-rw-r--r--fs/f2fs/node.c9
-rw-r--r--fs/f2fs/recovery.c29
-rw-r--r--fs/f2fs/segment.c45
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/super.c96
-rw-r--r--fs/f2fs/sysfs.c9
-rw-r--r--fs/f2fs/verity.c15
-rw-r--r--fs/f2fs/xattr.c8
-rw-r--r--fs/fat/dir.c8
-rw-r--r--fs/fat/inode.c2
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file.c1
-rw-r--r--fs/file_table.c7
-rw-r--r--fs/fs-writeback.c37
-rw-r--r--fs/fuse/dev.c3
-rw-r--r--fs/fuse/dir.c24
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/gfs2/export.c6
-rw-r--r--fs/gfs2/file.c29
-rw-r--r--fs/gfs2/glock.c257
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/inode.c13
-rw-r--r--fs/gfs2/main.c24
-rw-r--r--fs/gfs2/meta_io.c7
-rw-r--r--fs/gfs2/ops_fstype.c31
-rw-r--r--fs/gfs2/quota.c8
-rw-r--r--fs/gfs2/super.c3
-rw-r--r--fs/gfs2/util.c12
-rw-r--r--fs/hfs/bnode.c32
-rw-r--r--fs/hfs/btree.c29
-rw-r--r--fs/hfsplus/bitmap.c20
-rw-r--r--fs/hfsplus/bnode.c105
-rw-r--r--fs/hfsplus/btree.c27
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hugetlbfs/inode.c342
-rw-r--r--fs/inode.c7
-rw-r--r--fs/internal.h12
-rw-r--r--fs/iomap/buffered-io.c3
-rw-r--r--fs/iomap/trace.h1
-rw-r--r--fs/isofs/compress.c22
-rw-r--r--fs/isofs/inode.c9
-rw-r--r--fs/jbd2/commit.c12
-rw-r--r--fs/jbd2/journal.c34
-rw-r--r--fs/jbd2/recovery.c17
-rw-r--r--fs/jbd2/transaction.c6
-rw-r--r--fs/jffs2/wbuf.c6
-rw-r--r--fs/kernfs/dir.c107
-rw-r--r--fs/kernfs/file.c151
-rw-r--r--fs/kernfs/kernfs-internal.h1
-rw-r--r--fs/ksmbd/auth.c15
-rw-r--r--fs/ksmbd/auth.h3
-rw-r--r--fs/ksmbd/connection.c8
-rw-r--r--fs/ksmbd/connection.h2
-rw-r--r--fs/ksmbd/ksmbd_netlink.h3
-rw-r--r--fs/ksmbd/mgmt/share_config.c36
-rw-r--r--fs/ksmbd/mgmt/share_config.h4
-rw-r--r--fs/ksmbd/mgmt/tree_connect.c6
-rw-r--r--fs/ksmbd/mgmt/tree_connect.h2
-rw-r--r--fs/ksmbd/misc.c46
-rw-r--r--fs/ksmbd/misc.h5
-rw-r--r--fs/ksmbd/ndr.c8
-rw-r--r--fs/ksmbd/oplock.c27
-rw-r--r--fs/ksmbd/server.c4
-rw-r--r--fs/ksmbd/smb2pdu.c142
-rw-r--r--fs/ksmbd/smb2pdu.h7
-rw-r--r--fs/ksmbd/smb_common.c6
-rw-r--r--fs/ksmbd/smbacl.c12
-rw-r--r--fs/ksmbd/smbacl.h18
-rw-r--r--fs/ksmbd/transport_rdma.c8
-rw-r--r--fs/ksmbd/transport_tcp.c3
-rw-r--r--fs/ksmbd/unicode.h3
-rw-r--r--fs/ksmbd/vfs.c50
-rw-r--r--fs/ksmbd/vfs.h4
-rw-r--r--fs/libfs.c46
-rw-r--r--fs/mbcache.c17
-rw-r--r--fs/minix/namei.c6
-rw-r--r--fs/namei.c94
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c109
-rw-r--r--fs/nfs/inode.c18
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs42proc.c6
-rw-r--r--fs/nfs/nfs42xattr.c2
-rw-r--r--fs/nfs/nfs42xdr.c8
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4client.c2
-rw-r--r--fs/nfs/nfs4idmap.c2
-rw-r--r--fs/nfs/nfs4proc.c18
-rw-r--r--fs/nfs/nfs4state.c13
-rw-r--r--fs/nfs/nfs4trace.h50
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/pnfs.c13
-rw-r--r--fs/nfs/pnfs.h9
-rw-r--r--fs/nfs/pnfs_nfs.c4
-rw-r--r--fs/nfsd/filecache.c88
-rw-r--r--fs/nfsd/nfs4recover.c8
-rw-r--r--fs/nfsd/nfs4state.c4
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/vfs.c6
-rw-r--r--fs/nilfs2/btree.c6
-rw-r--r--fs/nilfs2/inode.c19
-rw-r--r--fs/nilfs2/page.c45
-rw-r--r--fs/nilfs2/segment.c25
-rw-r--r--fs/notify/fanotify/fanotify.c2
-rw-r--r--fs/notify/fanotify/fanotify.h8
-rw-r--r--fs/notify/fanotify/fanotify_user.c6
-rw-r--r--fs/notify/fsnotify.h4
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/attrib.c28
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ntfs/inode.c7
-rw-r--r--fs/ntfs3/bitmap.c4
-rw-r--r--fs/ntfs3/fslog.c6
-rw-r--r--fs/ntfs3/inode.c7
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ocfs2/dir.c10
-rw-r--r--fs/ocfs2/journal.c14
-rw-r--r--fs/ocfs2/namei.c23
-rw-r--r--fs/ocfs2/ocfs2_fs.h8
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/stackglue.c4
-rw-r--r--fs/ocfs2/suballoc.h2
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/open.c11
-rw-r--r--fs/orangefs/dir.c2
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/overlayfs/copy_up.c116
-rw-r--r--fs/overlayfs/file.c2
-rw-r--r--fs/overlayfs/inode.c6
-rw-r--r--fs/overlayfs/namei.c4
-rw-r--r--fs/overlayfs/overlayfs.h34
-rw-r--r--fs/overlayfs/readdir.c44
-rw-r--r--fs/overlayfs/super.c18
-rw-r--r--fs/overlayfs/util.c10
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/posix_acl.c3
-rw-r--r--fs/proc/Kconfig1
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c24
-rw-r--r--fs/proc/devices.c6
-rw-r--r--fs/proc/internal.h7
-rw-r--r--fs/proc/kmsg.c2
-rw-r--r--fs/proc/loadavg.c6
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/proc_sysctl.c9
-rw-r--r--fs/proc/softirqs.c6
-rw-r--r--fs/proc/task_mmu.c96
-rw-r--r--fs/proc/task_nommu.c45
-rw-r--r--fs/proc/uptime.c6
-rw-r--r--fs/proc/version.c6
-rw-r--r--fs/qnx6/inode.c6
-rw-r--r--fs/quota/quota_tree.c73
-rw-r--r--fs/ramfs/file-nommu.c50
-rw-r--r--fs/ramfs/inode.c6
-rw-r--r--fs/readdir.c68
-rw-r--r--fs/reiserfs/journal.c11
-rw-r--r--fs/reiserfs/prints.c2
-rw-r--r--fs/reiserfs/procfs.c4
-rw-r--r--fs/reiserfs/resize.c2
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c11
-rw-r--r--fs/reiserfs/xattr.c20
-rw-r--r--fs/smbfs_common/smb2pdu.h6
-rw-r--r--fs/super.c3
-rw-r--r--fs/ubifs/crypto.c11
-rw-r--r--fs/ubifs/debug.c10
-rw-r--r--fs/ubifs/dir.c34
-rw-r--r--fs/ubifs/journal.c30
-rw-r--r--fs/ubifs/lpt_commit.c14
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/dir.c2
-rw-r--r--fs/udf/directory.c2
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/inode.c8
-rw-r--r--fs/udf/namei.c6
-rw-r--r--fs/ufs/balloc.c12
-rw-r--r--fs/userfaultfd.c133
-rw-r--r--fs/verity/fsverity_private.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c50
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c4
-rw-r--r--fs/xfs/scrub/dir.c10
-rw-r--r--fs/xfs/scrub/parent.c4
-rw-r--r--fs/xfs/xfs_attr_item.c6
-rw-r--r--fs/xfs/xfs_dir2_readdir.c2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_inode.c13
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_inode_item_recover.c4
-rw-r--r--fs/xfs/xfs_iops.c22
-rw-r--r--fs/xfs/xfs_iops.h1
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_log.c12
-rw-r--r--fs/xfs/xfs_mount.c38
-rw-r--r--fs/xfs/xfs_notify_failure.c26
-rw-r--r--fs/xfs/xfs_reflink.c22
-rw-r--r--fs/xfs/xfs_stats.c4
-rw-r--r--fs/xfs/xfs_super.c10
-rw-r--r--fs/xfs/xfs_trace.h4
-rw-r--r--fs/xfs/xfs_trans_ail.c8
389 files changed, 12898 insertions, 9199 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index a547307c1ae8..2685a4d0d353 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config ARCH_SUPPORTS_HUGETLBFS
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on (SYSFS || SYSCTL)
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f14478643b91..93539aac0e5b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on ARM || ((M68K || SUPERH) && !MMU)
+ depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4c5f30a83336..58b391446ae1 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -276,7 +276,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
char *vol = match_strdup(&args[0]);
if (!vol)
return 0;
- strlcpy(volume, vol, 32);
+ strscpy(volume, vol, 32);
kfree(vol);
break;
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 56ae5cd5184f..230c2d19116d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_iput(struct dentry *dentry, struct inode *inode);
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl);
@@ -568,7 +568,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_one_cookie *cookie =
@@ -584,16 +584,16 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
if (cookie->name.len != nlen ||
memcmp(cookie->name.name, name, nlen) != 0) {
- _leave(" = 0 [no]");
- return 0;
+ _leave(" = true [keep looking]");
+ return true;
}
cookie->fid.vnode = ino;
cookie->fid.unique = dtype;
cookie->found = 1;
- _leave(" = -1 [found]");
- return -1;
+ _leave(" = false [found]");
+ return false;
}
/*
@@ -636,12 +636,11 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_cookie *cookie =
container_of(ctx, struct afs_lookup_cookie, ctx);
- int ret;
_enter("{%s,%u},%s,%u,,%llu,%u",
cookie->name.name, cookie->name.len, name, nlen,
@@ -663,12 +662,10 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
cookie->fids[1].unique = dtype;
cookie->found = 1;
if (cookie->one_only)
- return -1;
+ return false;
}
- ret = cookie->nr_fids >= 50 ? -1 : 0;
- _leave(" = %d", ret);
- return ret;
+ return cookie->nr_fids < 50;
}
/*
diff --git a/fs/aio.c b/fs/aio.c
index 606613e9d1f4..5b2ff20ad322 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx)
local_irq_save(flags);
kcpu = this_cpu_ptr(ctx->cpu);
if (!kcpu->reqs_available) {
- int old, avail = atomic_read(&ctx->reqs_available);
+ int avail = atomic_read(&ctx->reqs_available);
do {
if (avail < ctx->req_batch)
goto out;
-
- old = avail;
- avail = atomic_cmpxchg(&ctx->reqs_available,
- avail, avail - ctx->req_batch);
- } while (avail != old);
+ } while (!atomic_try_cmpxchg(&ctx->reqs_available,
+ &avail, avail - ctx->req_batch));
kcpu->reqs_available += ctx->req_batch;
}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e0c3e33c4177..24192a7667ed 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -32,7 +32,7 @@ static struct inode *anon_inode_inode;
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
+ return dynamic_dname(buffer, buflen, "anon_inode:%s",
dentry->d_name.name);
}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 12b8fdcc445b..9d1cde8066cf 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -147,7 +147,7 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
}
static int bad_inode_tmpfile(struct user_namespace *mnt_userns,
- struct inode *inode, struct dentry *dentry,
+ struct inode *inode, struct file *file,
umode_t mode)
{
return -EIO;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 63c7ebb0da89..6a11025e5850 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -911,7 +911,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
if (!interp_elf_ex) {
retval = -ENOMEM;
- goto out_free_ph;
+ goto out_free_file;
}
/* Get the exec headers */
@@ -1354,6 +1354,7 @@ out:
out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
+out_free_file:
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 99f9995670ea..fa9ddcc9eb0b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o
+ subpage.o tree-mod-log.o extent-io-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d385357e19b6..4ec18ceb2f21 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -138,6 +138,7 @@ struct share_check {
u64 root_objectid;
u64 inum;
int share_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -820,16 +821,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -855,10 +851,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -884,13 +886,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -920,7 +931,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -1023,7 +1034,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1033,6 +1045,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ret = add_indirect_ref(fs_info, preftrees, root,
&key, 0, bytenr, count,
sc, GFP_NOFS);
+
break;
}
default:
@@ -1122,7 +1135,8 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1511,16 +1525,137 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
-/**
- * Check if an extent is shared or not
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool *is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+
+ if (!cache->use_cache)
+ return false;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return false;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ entry = &cache->entries[level];
+
+ /* Unused cache entry or being used for some other extent buffer. */
+ if (entry->bytenr != bytenr)
+ return false;
+
+ /*
+ * We cached a false result, but the last snapshot generation of the
+ * root changed, so we now have a snapshot. Don't trust the result.
+ */
+ if (!entry->is_shared &&
+ entry->gen != btrfs_root_last_snapshot(&root->root_item))
+ return false;
+
+ /*
+ * If we cached a true result and the last generation used for dropping
+ * a root changed, we can not trust the result, because the dropped root
+ * could be a snapshot sharing this extent buffer.
+ */
+ if (entry->is_shared &&
+ entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ return false;
+
+ *is_shared = entry->is_shared;
+ /*
+ * If the node at this level is shared, than all nodes below are also
+ * shared. Currently some of the nodes below may be marked as not shared
+ * because we have just switched from one leaf to another, and switched
+ * also other nodes above the leaf and below the current level, so mark
+ * them as shared.
+ */
+ if (*is_shared) {
+ for (int i = 0; i < level; i++) {
+ cache->entries[i].is_shared = true;
+ cache->entries[i].gen = entry->gen;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+ u64 gen;
+
+ if (!cache->use_cache)
+ return;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ if (is_shared)
+ gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ else
+ gen = btrfs_root_last_snapshot(&root->root_item);
+
+ entry = &cache->entries[level];
+ entry->bytenr = bytenr;
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+
+ /*
+ * If we found an extent buffer is shared, set the cache result for all
+ * extent buffers below it to true. As nodes in the path are COWed,
+ * their sharedness is moved to their children, and if a leaf is COWed,
+ * then the sharedness of a data extent becomes direct, the refcount of
+ * data extent is increased in the extent item at the extent tree.
+ */
+ if (is_shared) {
+ for (int i = 0; i < level; i++) {
+ entry = &cache->entries[i];
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+ }
+ }
+}
+
+/*
+ * Check if a data extent is shared or not.
*
- * @root: root inode belongs to
- * @inum: inode number of the inode whose extent we are checking
- * @bytenr: logical bytenr of the extent we are checking
- * @roots: list of roots this extent is shared among
- * @tmp: temporary list used for iteration
+ * @root: The root the inode belongs to.
+ * @inum: Number of the inode whose extent we are checking.
+ * @bytenr: Logical bytenr of the extent we are checking.
+ * @extent_gen: Generation of the extent (file extent item) or 0 if it is
+ * not known.
+ * @roots: List of roots this extent is shared among.
+ * @tmp: Temporary list used for iteration.
+ * @cache: A backref lookup result cache.
*
- * btrfs_check_shared uses the backref walking code but will short
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
* one passed in. This provides a significant performance benefit for
* callers (such as fiemap) which want to know whether the extent is
@@ -1531,8 +1666,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp)
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
@@ -1544,7 +1681,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
.root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
+ .have_delayed_delete_refs = false,
};
+ int level;
ulist_init(roots);
ulist_init(tmp);
@@ -1561,23 +1700,73 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
btrfs_get_tree_mod_seq(fs_info, &elem);
}
+ /* -1 means we are in the bytenr of the data extent. */
+ level = -1;
ULIST_ITER_INIT(&uiter);
+ cache->use_cache = true;
while (1) {
+ bool is_shared;
+ bool cached;
+
ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
roots, NULL, &shared, false);
if (ret == BACKREF_FOUND_SHARED) {
/* this is the only condition under which we return 1 */
ret = 1;
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, true);
break;
}
if (ret < 0 && ret != -ENOENT)
break;
ret = 0;
+ /*
+ * If our data extent is not shared through reflinks and it was
+ * created in a generation after the last one used to create a
+ * snapshot of the inode's root, then it can not be shared
+ * indirectly through subtrees, as that can only happen with
+ * snapshots. In this case bail out, no need to check for the
+ * sharedness of extent buffers.
+ */
+ if (level == -1 &&
+ extent_gen > btrfs_root_last_snapshot(&root->root_item))
+ break;
+
+ /*
+ * If our data extent was not directly shared (without multiple
+ * reference items), than it might have a single reference item
+ * with a count > 1 for the same offset, which means there are 2
+ * (or more) file extent items that point to the data extent -
+ * this happens when a file extent item needs to be split and
+ * then one item gets moved to another leaf due to a b+tree leaf
+ * split when inserting some item. In this case the file extent
+ * items may be located in different leaves and therefore some
+ * of the leaves may be referenced through shared subtrees while
+ * others are not. Since our extent buffer cache only works for
+ * a single path (by far the most common case and simpler to
+ * deal with), we can not use it if we have multiple leaves
+ * (which implies multiple paths).
+ */
+ if (level == -1 && tmp->nnodes > 1)
+ cache->use_cache = false;
+
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, false);
node = ulist_next(tmp, &uiter);
if (!node)
break;
bytenr = node->val;
+ level++;
+ cached = lookup_backref_shared_cache(cache, root, bytenr, level,
+ &is_shared);
+ if (cached) {
+ ret = (is_shared ? 1 : 0);
+ break;
+ }
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2759de7d324c..8e69584d538d 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -17,6 +17,21 @@ struct inode_fs_paths {
struct btrfs_data_container *fspath;
};
+struct btrfs_backref_shared_cache_entry {
+ u64 bytenr;
+ u64 gen;
+ bool is_shared;
+};
+
+struct btrfs_backref_shared_cache {
+ /*
+ * A path from a root to a leaf that has a file extent item pointing to
+ * a given data extent should never exceed the maximum b+tree height.
+ */
+ struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
+ bool use_cache;
+};
+
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
@@ -62,8 +77,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp_ulist);
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e0375ba9d0fe..deebc8ddbd93 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -593,8 +593,6 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -618,9 +616,6 @@ next:
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -655,7 +650,6 @@ next:
total_found += add_new_free_space(block_group, last,
block_group->start + block_group->length);
- caching_ctl->progress = (u64)-1;
out:
btrfs_free_path(path);
@@ -725,8 +719,6 @@ done:
}
#endif
- caching_ctl->progress = (u64)-1;
-
up_read(&fs_info->commit_root_sem);
btrfs_free_excluded_extents(block_group);
mutex_unlock(&caching_ctl->mutex);
@@ -755,7 +747,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->start;
refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
@@ -772,7 +763,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
write_lock(&fs_info->block_group_cache_lock);
@@ -988,32 +978,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
- if (block_group->has_caching_ctl)
- caching_ctl = btrfs_get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- write_lock(&fs_info->block_group_cache_lock);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- write_unlock(&fs_info->block_group_cache_lock);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- btrfs_put_caching_control(caching_ctl);
- btrfs_put_caching_control(caching_ctl);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
}
}
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
spin_lock(&trans->transaction->dirty_bgs_lock);
WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1034,12 +1023,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
< block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
- WARN_ON(block_group->zone_is_active &&
+ WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags) &&
block_group->space_info->active_total_bytes
< block_group->length);
}
block_group->space_info->total_bytes -= block_group->length;
- if (block_group->zone_is_active)
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
block_group->space_info->active_total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
@@ -1069,7 +1059,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
- block_group->removed = 1;
+ set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
/*
* At this point trimming or scrub can't start on this block group,
* because we removed the block group from the rbtree
@@ -1304,6 +1295,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
/*
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
@@ -1543,6 +1537,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
if (!btrfs_should_reclaim(fs_info))
return;
@@ -1890,16 +1887,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return 0;
}
-static void link_block_group(struct btrfs_block_group *cache)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
-
- down_write(&space_info->groups_sem);
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
-}
-
static struct btrfs_block_group *btrfs_create_block_group_cache(
struct btrfs_fs_info *fs_info, u64 start)
{
@@ -1937,7 +1924,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+ cache->full_stripe_locks_root.root = RB_ROOT;
+ mutex_init(&cache->full_stripe_locks_root.lock);
return cache;
}
@@ -2002,7 +1990,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
int need_clear)
{
struct btrfs_block_group *cache;
- struct btrfs_space_info *space_info;
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
int ret;
@@ -2078,11 +2065,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
/* Should not have any excluded extents. Just in case, though. */
btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, cache->start,
cache->start + cache->length);
@@ -2095,14 +2080,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super,
- cache->zone_unusable, cache->zone_is_active,
- &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
+ btrfs_add_bg_to_space_info(info, cache);
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_writeable(info, cache->start)) {
@@ -2126,7 +2104,6 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct btrfs_space_info *space_info;
struct rb_node *node;
int ret = 0;
@@ -2146,7 +2123,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
/* Fill dummy cache as FULL */
bg->length = em->len;
bg->flags = map->type;
- bg->last_byte_to_unpin = (u64)-1;
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = em->len;
bg->flags = map->type;
@@ -2167,10 +2143,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, 0, false, &space_info);
- bg->space_info = space_info;
- link_block_group(bg);
+ btrfs_add_bg_to_space_info(fs_info, bg);
set_avail_alloc_bits(fs_info, bg->flags);
}
@@ -2190,7 +2163,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
- if (!root)
+ /*
+ * Either no extent root (with ibadroots rescue option) or we have
+ * unsupported RO options. The fs can never be mounted read-write, so no
+ * need to waste time searching block group items.
+ *
+ * This also allows new extent tree related changes to be RO compat,
+ * no need for a full incompat flag.
+ */
+ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP))
return fill_dummy_bgs(info);
key.objectid = 0;
@@ -2425,7 +2407,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- if (!block_group->chunk_item_inserted) {
+ if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ &block_group->runtime_flags)) {
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2494,7 +2477,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2519,14 +2501,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
btrfs_free_excluded_extents(cache);
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- bytes_used += new_bytes_used >> 1;
- fragment_free_space(cache);
- }
-#endif
/*
* Ensure the corresponding space_info object is created and
* assigned to our block group. We want our bg to be added to the rbtree
@@ -2547,12 +2521,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, cache->zone_unusable,
- cache->zone_is_active, &cache->space_info);
+ btrfs_add_bg_to_space_info(fs_info, cache);
btrfs_update_global_block_rsv(fs_info);
- link_block_group(cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ cache->space_info->bytes_used += new_bytes_used >> 1;
+ fragment_free_space(cache);
+ }
+#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
trans->delayed_ref_updates++;
@@ -2869,7 +2848,7 @@ again:
cache_size *= fs_info->sectorsize;
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
- cache_size);
+ cache_size, false);
if (ret)
goto out_put;
@@ -3965,35 +3944,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
- u64 last = 0;
- while (1) {
- struct inode *inode;
+ block_group = btrfs_lookup_first_block_group(info, 0);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+ &block_group->runtime_flags)) {
+ struct inode *inode = block_group->inode;
- block_group = btrfs_lookup_first_block_group(info, last);
- while (block_group) {
- btrfs_wait_block_group_cache_done(block_group);
- spin_lock(&block_group->lock);
- if (block_group->iref)
- break;
+ block_group->inode = NULL;
spin_unlock(&block_group->lock);
- block_group = btrfs_next_block_group(block_group);
- }
- if (!block_group) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- inode = block_group->inode;
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
- last = block_group->start + block_group->length;
- btrfs_put_block_group(block_group);
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ block_group = btrfs_next_block_group(block_group);
}
}
@@ -4129,7 +4097,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_lock(&block_group->lock);
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
- block_group->removed);
+ test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
spin_unlock(&block_group->lock);
if (cleanup) {
@@ -4150,7 +4118,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
* tasks trimming this block group have left 1 entry each one.
* Free them if any.
*/
- __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ btrfs_remove_free_space_cache(block_group);
}
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 6b3cdc4cbc41..8fb14b99a1d1 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -46,19 +46,44 @@ enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
+/* Block group flags set at runtime */
+enum btrfs_block_group_flags {
+ BLOCK_GROUP_FLAG_IREF,
+ BLOCK_GROUP_FLAG_REMOVED,
+ BLOCK_GROUP_FLAG_TO_COPY,
+ BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
+ BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
+};
+
struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
- u64 progress;
refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
@@ -95,23 +120,15 @@ struct btrfs_block_group {
/* For raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
+ unsigned long runtime_flags;
unsigned int ro;
- unsigned int iref:1;
- unsigned int has_caching_ctl:1;
- unsigned int removed:1;
- unsigned int to_copy:1;
- unsigned int relocating_repair:1;
- unsigned int chunk_item_inserted:1;
- unsigned int zone_is_active:1;
- unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
/* Cache tracking stuff */
int cached;
struct btrfs_caching_control *caching_ctl;
- u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
@@ -305,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
struct block_device *bdev, u64 physical, u64 **logical,
int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 06be0644dd37..ec96285357e0 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -286,7 +286,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
*/
if (block_rsv == delayed_rsv)
target = global_rsv;
- else if (block_rsv != global_rsv && !delayed_rsv->full)
+ else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
if (target && block_rsv->space_info != target->space_info)
@@ -424,6 +424,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 0c183709be00..578c3497a455 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -92,4 +92,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
+/*
+ * Fast path to check if the reserve is full, may be carefully used outside of
+ * locks.
+ */
+static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
+{
+ return data_race(rsv->full);
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b160b8e124e0..54c2ccb36b61 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -65,6 +65,8 @@ enum {
* on the same file.
*/
BTRFS_INODE_VERITY_IN_PROGRESS,
+ /* Set when this inode is a free space inode. */
+ BTRFS_INODE_FREE_SPACE_INODE,
};
/* in memory btrfs inode */
@@ -94,7 +96,8 @@ struct btrfs_inode {
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
- struct extent_io_tree io_failure_tree;
+ struct rb_root io_failure_tree;
+ spinlock_t io_failure_lock;
/*
* Keep track of where the inode has extent items mapped in order to
@@ -250,11 +253,6 @@ struct btrfs_inode {
struct inode vfs_inode;
};
-static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
-{
- return inode->root->fs_info->sectorsize;
-}
-
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -272,13 +270,6 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
return (unsigned long)h;
}
-static inline void btrfs_insert_inode_hash(struct inode *inode)
-{
- unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
-
- __insert_inode_hash(inode, h);
-}
-
#if BITS_PER_LONG == 32
/*
@@ -312,13 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
- struct btrfs_root *root = inode->root;
-
- if (root == root->fs_info->tree_root &&
- btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
- return true;
-
- return false;
+ return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
static inline bool is_data_inode(struct inode *inode)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e84d22c5c6a8..f1f051ad3147 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -8,6 +8,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
@@ -15,6 +16,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
+#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
@@ -152,9 +154,7 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
}
/* Do io completion on the original bio */
- if (cb->status != BLK_STS_OK)
- cb->orig_bio->bi_status = cb->status;
- bio_endio(cb->orig_bio);
+ btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
/* Finally free the cb struct */
kfree(cb->compressed_pages);
@@ -166,16 +166,15 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
* before decompressing it into the original bio and freeing the uncompressed
* pages.
*/
-static void end_compressed_bio_read(struct bio *bio)
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
+ struct compressed_bio *cb = bbio->private;
struct inode *inode = cb->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *bi = BTRFS_I(inode);
bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
- blk_status_t status = bio->bi_status;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ blk_status_t status = bbio->bio.bi_status;
struct bvec_iter iter;
struct bio_vec bv;
u32 offset;
@@ -186,9 +185,8 @@ static void end_compressed_bio_read(struct bio *bio)
if (!status &&
(!csum || !btrfs_check_data_csum(inode, bbio, offset,
bv.bv_page, bv.bv_offset))) {
- clean_io_failure(fs_info, &bi->io_failure_tree,
- &bi->io_tree, start, bv.bv_page,
- btrfs_ino(bi), bv.bv_offset);
+ btrfs_clean_io_failure(bi, start, bv.bv_page,
+ bv.bv_offset);
} else {
int ret;
@@ -209,7 +207,7 @@ static void end_compressed_bio_read(struct bio *bio)
if (refcount_dec_and_test(&cb->pending_ios))
finish_compressed_bio_read(cb);
btrfs_bio_free_csum(bbio);
- bio_put(bio);
+ bio_put(&bbio->bio);
}
/*
@@ -222,8 +220,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
- struct page *pages[16];
- unsigned long nr_pages = end_index - index + 1;
+ struct folio_batch fbatch;
const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
@@ -231,24 +228,23 @@ static noinline void end_compressed_writeback(struct inode *inode,
if (errno)
mapping_set_error(inode->i_mapping, errno);
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- nr_pages -= 1;
- index += 1;
- continue;
- }
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ ret = filemap_get_folios(inode->i_mapping, &index, end_index,
+ &fbatch);
+
+ if (ret == 0)
+ return;
+
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (errno)
- SetPageError(pages[i]);
- btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ folio_set_error(folio);
+ btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
cb->start, cb->len);
- put_page(pages[i]);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
}
/* the inode may be gone now */
}
@@ -301,20 +297,20 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio)
+static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
+ struct compressed_bio *cb = bbio->private;
- if (bio->bi_status)
- cb->status = bio->bi_status;
+ if (bbio->bio.bi_status)
+ cb->status = bbio->bio.bi_status;
if (refcount_dec_and_test(&cb->pending_ios)) {
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}
- bio_put(bio);
+ bio_put(&bbio->bio);
}
/*
@@ -335,7 +331,8 @@ static void end_compressed_bio_write(struct bio *bio)
static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
- blk_opf_t opf, bio_end_io_t endio_func,
+ blk_opf_t opf,
+ btrfs_bio_end_io_t endio_func,
u64 *next_stripe_start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
@@ -344,12 +341,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
struct bio *bio;
int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
-
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bio->bi_opf = opf;
- bio->bi_private = cb;
- bio->bi_end_io = endio_func;
em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
if (IS_ERR(em)) {
@@ -478,8 +471,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, true);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
break;
}
}
@@ -519,7 +511,8 @@ static u64 bio_end_offset(struct bio *bio)
*/
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
- struct compressed_bio *cb)
+ struct compressed_bio *cb,
+ unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
@@ -588,6 +581,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
+ if (PageWorkingset(page))
+ psi_memstall_enter(pflags);
+
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -596,7 +592,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
- lock_extent(tree, cur, page_end);
+ lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
@@ -610,7 +606,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -630,7 +626,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -674,6 +670,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
+ /* Initialize to 1 to make skip psi_memstall_leave unless needed */
+ unsigned long pflags = 1;
blk_status_t ret;
int ret2;
int i;
@@ -729,7 +727,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
goto fail;
}
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ add_ra_bio_pages(inode, em_start + em_len, cb, &pflags);
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
@@ -799,8 +797,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
+ btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
break;
}
@@ -810,6 +807,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
}
+ if (!pflags)
+ psi_memstall_leave(&pflags);
+
if (refcount_dec_and_test(&cb->pending_ios))
finish_compressed_bio_read(cb);
return;
@@ -826,8 +826,7 @@ fail:
kfree(cb);
out:
free_extent_map(em);
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ebfa35fe1c38..b39b339fbf96 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1447,6 +1447,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
+ if (p->nowait) {
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+ }
+
if (unlock_up)
btrfs_unlock_up_safe(p, level + 1);
@@ -1467,6 +1472,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
ret = -EAGAIN;
goto out;
+ } else if (p->nowait) {
+ return -EAGAIN;
}
if (unlock_up) {
@@ -1634,7 +1641,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ if (p->nowait) {
+ b = btrfs_try_read_lock_root_node(root);
+ if (IS_ERR(b))
+ return b;
+ } else {
+ b = btrfs_read_lock_root_node(root);
+ }
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -1910,6 +1923,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
WARN_ON(p->nodes[0] != NULL);
BUG_ON(!cow && ins_len);
+ /*
+ * For now only allow nowait for read only operations. There's no
+ * strict reason why we can't, we just only need it for reads so it's
+ * only implemented for reads.
+ */
+ ASSERT(!p->nowait || !cow);
+
if (ins_len < 0) {
lowest_unlock = 2;
@@ -1936,7 +1956,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (p->need_commit_sem) {
ASSERT(p->search_commit_root);
- down_read(&fs_info->commit_root_sem);
+ if (p->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem))
+ return -EAGAIN;
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
}
again:
@@ -2082,7 +2107,15 @@ cow_done:
btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- btrfs_tree_read_lock(b);
+ if (p->nowait) {
+ if (!btrfs_try_tree_read_lock(b)) {
+ free_extent_buffer(b);
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(b);
+ }
p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
@@ -2131,6 +2164,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
+ ASSERT(!p->nowait);
if (p->search_commit_root) {
BUG_ON(time_seq);
@@ -4432,6 +4466,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int ret = 1;
int keep_locks = path->keep_locks;
+ ASSERT(!path->nowait);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -4612,6 +4647,8 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int ret;
int i;
+ ASSERT(!path->nowait);
+
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index df8c99c99df9..727595eee973 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -42,7 +42,6 @@ struct btrfs_delayed_ref_root;
struct btrfs_space_info;
struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
@@ -50,6 +49,11 @@ struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
struct btrfs_ioctl_encoded_io_args;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct reloc_control;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -280,14 +284,9 @@ struct btrfs_super_block {
/* the UUID written into btree blocks */
u8 metadata_uuid[BTRFS_FSID_SIZE];
- /* Extent tree v2 */
- __le64 block_group_root;
- __le64 block_group_root_generation;
- u8 block_group_root_level;
-
/* future expansion */
- u8 reserved8[7];
- __le64 reserved[25];
+ u8 reserved8[8];
+ __le64 reserved[27];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
@@ -307,7 +306,8 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
#define BTRFS_FEATURE_COMPAT_RO_SUPP \
(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
- BTRFS_FEATURE_COMPAT_RO_VERITY)
+ BTRFS_FEATURE_COMPAT_RO_VERITY | \
+ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -443,9 +443,10 @@ struct btrfs_path {
* header (ie. sizeof(struct btrfs_item) is not included).
*/
unsigned int search_for_extension:1;
+ /* Stop search if any locks need to be taken (for read) */
+ unsigned int nowait:1;
};
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
- sizeof(struct btrfs_item))
+
struct btrfs_dev_replace {
u64 replace_state; /* see #define above */
time64_t time_started; /* seconds since 1-Jan-1970 */
@@ -502,21 +503,6 @@ struct btrfs_free_cluster {
struct list_head block_group_list;
};
-enum btrfs_caching_type {
- BTRFS_CACHE_NO,
- BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FINISHED,
- BTRFS_CACHE_ERROR,
-};
-
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
- struct rb_root root;
- struct mutex lock;
-};
-
/* Discard control. */
/*
* Async discard uses multiple lists to differentiate the discard filter
@@ -548,42 +534,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
-
-/* fs_info */
-struct reloc_control;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-
-/*
- * Block group or device which contains an active swapfile. Used for preventing
- * unsafe operations while a swapfile is active.
- *
- * These are sorted on (ptr, inode) (note that a block group or device can
- * contain more than one swapfile). We compare the pointer values because we
- * don't actually care what the object is, we just need a quick check whether
- * the object exists in the rbtree.
- */
-struct btrfs_swapfile_pin {
- struct rb_node node;
- void *ptr;
- struct inode *inode;
- /*
- * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
- * points to a struct btrfs_device.
- */
- bool is_block_group;
- /*
- * Only used when 'is_block_group' is true and it is the number of
- * extents used by a swapfile for this block group ('ptr' field).
- */
- int bg_extent_count;
-};
-
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
enum {
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
@@ -890,6 +840,7 @@ struct btrfs_fs_info {
struct kobject *space_info_kobj;
struct kobject *qgroups_kobj;
+ struct kobject *discard_kobj;
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
@@ -1005,6 +956,7 @@ struct btrfs_fs_info {
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
+ u8 qgroup_drop_subtree_thres;
/* filesystem state */
unsigned long fs_state;
@@ -1092,6 +1044,23 @@ struct btrfs_fs_info {
/* Updates are not protected by any lock */
struct btrfs_commit_stats commit_stats;
+ /*
+ * Last generation where we dropped a non-relocation root.
+ * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+ * to change it and to read it, respectively.
+ */
+ u64 last_root_drop_gen;
+
+ /*
+ * Annotations for transaction events (structures are empty when
+ * compiled without lockdep).
+ */
+ struct lockdep_map btrfs_trans_num_writers_map;
+ struct lockdep_map btrfs_trans_num_extwriters_map;
+ struct lockdep_map btrfs_state_change_map[4];
+ struct lockdep_map btrfs_trans_pending_ordered_map;
+ struct lockdep_map btrfs_ordered_extent_map;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1099,7 +1068,6 @@ struct btrfs_fs_info {
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
- struct kobject *discard_debug_kobj;
struct list_head allocated_roots;
spinlock_t eb_leak_lock;
@@ -1107,12 +1075,85 @@ struct btrfs_fs_info {
#endif
};
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+ u64 gen)
+{
+ WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+ sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many fs_info->max_extent_size cover the @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
+/*
* The state of btrfs root
*/
enum {
@@ -1174,6 +1215,82 @@ enum {
BTRFS_ROOT_RESET_LOCKDEP_CLASS,
};
+enum btrfs_lockdep_trans_states {
+ BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+ BTRFS_LOCKDEP_TRANS_COMPLETED,
+};
+
+/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock) \
+ do { \
+ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->lock##_map, _THIS_IP_); \
+ } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock) \
+ rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i) \
+ do { \
+ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
+ } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i) \
+ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i) \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
+ } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
+ &lock##_key, 0); \
+ } while (0)
+
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
@@ -2391,17 +2508,6 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/*
- * For extent tree v2 we overload the extent root with the block group root, as
- * we will have multiple extent roots.
- */
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
- extent_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
- extent_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
- struct btrfs_root_backup, extent_root_level, 8);
-
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
@@ -2534,13 +2640,6 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
- block_group_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
- struct btrfs_super_block,
- block_group_root_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
- block_group_root_level, 8);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
@@ -2761,45 +2860,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums,
- u64 offset)
-{
- u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
-
- return csums + offset_in_sectors * fs_info->csum_size;
-}
-
-/*
- * Take the number of bytes to be checksummed and figure out how many leaves
- * it would require to store the csums for that many bytes.
- */
-static inline u64 btrfs_csum_bytes_to_leaves(
- const struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
-
- return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
-}
-
-/*
- * Use this if we would be adding new items, as we could split nodes as we cow
- * down the tree.
- */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
-}
-
-/*
- * Doing a truncate or a modification won't result in new nodes or leaves, just
- * what we need for COW.
- */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
-}
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes);
@@ -3257,12 +3317,9 @@ int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 pos,
+ u64 num_bytes);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -3273,7 +3330,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit);
+ struct list_head *list, int search_commit,
+ bool nowait);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
@@ -3299,11 +3357,9 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u64 start, u64 end);
int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff);
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool strict);
+ u64 *ram_bytes, bool nowait, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
@@ -3358,7 +3414,6 @@ void btrfs_split_delalloc_extent(struct inode *inode,
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
void btrfs_evict_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_free_inode(struct inode *inode);
@@ -3439,15 +3494,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation op);
-
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3457,8 +3503,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned);
extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
@@ -3478,8 +3522,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes);
+ size_t *write_bytes, bool nowait);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3745,7 +3791,7 @@ const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, bool first_hit);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
@@ -3753,9 +3799,11 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
*/
#define btrfs_abort_transaction(trans, errno) \
do { \
+ bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
+ first = true; \
if ((errno) != -EIO && (errno) != -EROFS) { \
WARN(1, KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
@@ -3767,7 +3815,7 @@ do { \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno)); \
+ __LINE__, (errno), first); \
} while (0)
#ifdef CONFIG_PRINTK_INDEX
@@ -3984,16 +4032,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
-static inline void btrfs_init_full_stripe_locks_tree(
- struct btrfs_full_stripe_locks_tree *locks_root)
-{
- locks_root->root = RB_ROOT;
- mutex_init(&locks_root->lock);
-}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
@@ -4020,6 +4061,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
extern const struct fsverity_operations btrfs_verityops;
int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
encryption, 8);
@@ -4037,6 +4079,12 @@ static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
return 0;
}
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ return -EPERM;
+}
+
#endif
/* Sanity test specific functions */
@@ -4053,24 +4101,6 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
}
#endif
-static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
-{
- return fs_info->zone_size > 0;
-}
-
-/*
- * Count how many fs_info->max_extent_size cover the @size
- */
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
-{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (!fs_info)
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-#endif
-
- return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
-}
-
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
{
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 1e8f17ff829e..118b2e20b2e1 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -127,9 +127,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len)
+ struct extent_changeset **reserved, u64 start,
+ u64 len, bool noflush)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
int ret;
/* align the range */
@@ -137,7 +139,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (noflush)
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ else if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+ ret = btrfs_reserve_data_bytes(fs_info, len, flush);
if (ret < 0)
return ret;
@@ -454,7 +461,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
{
int ret;
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
+ ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 28bf5c3ef430..e07d46043455 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -7,7 +7,8 @@ struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len);
+ struct extent_changeset **reserved, u64 start, u64 len,
+ bool noflush);
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e7f34871a132..cac5169eaf8d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -302,15 +302,21 @@ static inline void btrfs_release_prepared_delayed_node(
__btrfs_release_delayed_node(node, 1);
}
-static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
+ struct btrfs_delayed_node *node,
+ enum btrfs_delayed_item_type type)
{
struct btrfs_delayed_item *item;
+
item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
if (item) {
item->data_len = data_len;
- item->ins_or_del = 0;
+ item->type = type;
item->bytes_reserved = 0;
- item->delayed_node = NULL;
+ item->delayed_node = node;
+ RB_CLEAR_NODE(&item->rb_node);
+ INIT_LIST_HEAD(&item->log_list);
+ item->logged = false;
refcount_set(&item->refs, 1);
}
return item;
@@ -319,72 +325,32 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
/*
* __btrfs_lookup_delayed_item - look up the delayed item by key
* @delayed_node: pointer to the delayed node
- * @key: the key to look up
- * @prev: used to store the prev item if the right item isn't found
- * @next: used to store the next item if the right item isn't found
+ * @index: the dir index value to lookup (offset of a dir index key)
*
* Note: if we don't find the right item, we will return the prev item and
* the next item.
*/
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
- struct btrfs_key *key,
- struct btrfs_delayed_item **prev,
- struct btrfs_delayed_item **next)
+ u64 index)
{
- struct rb_node *node, *prev_node = NULL;
+ struct rb_node *node = root->rb_node;
struct btrfs_delayed_item *delayed_item = NULL;
- int ret = 0;
-
- node = root->rb_node;
while (node) {
delayed_item = rb_entry(node, struct btrfs_delayed_item,
rb_node);
- prev_node = node;
- ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
- if (ret < 0)
+ if (delayed_item->index < index)
node = node->rb_right;
- else if (ret > 0)
+ else if (delayed_item->index > index)
node = node->rb_left;
else
return delayed_item;
}
- if (prev) {
- if (!prev_node)
- *prev = NULL;
- else if (ret < 0)
- *prev = delayed_item;
- else if ((node = rb_prev(prev_node)) != NULL) {
- *prev = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *prev = NULL;
- }
-
- if (next) {
- if (!prev_node)
- *next = NULL;
- else if (ret > 0)
- *next = delayed_item;
- else if ((node = rb_next(prev_node)) != NULL) {
- *next = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *next = NULL;
- }
return NULL;
}
-static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
- NULL, NULL);
-}
-
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
struct btrfs_delayed_item *ins)
{
@@ -392,15 +358,13 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
struct rb_node *parent_node = NULL;
struct rb_root_cached *root;
struct btrfs_delayed_item *item;
- int cmp;
bool leftmost = true;
- if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
- else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
- root = &delayed_node->del_root;
else
- BUG();
+ root = &delayed_node->del_root;
+
p = &root->rb_root.rb_node;
node = &ins->rb_node;
@@ -409,11 +373,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
item = rb_entry(parent_node, struct btrfs_delayed_item,
rb_node);
- cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0) {
+ if (item->index < ins->index) {
p = &(*p)->rb_right;
leftmost = false;
- } else if (cmp > 0) {
+ } else if (item->index > ins->index) {
p = &(*p)->rb_left;
} else {
return -EEXIST;
@@ -422,14 +385,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
- ins->delayed_node = delayed_node;
-
- /* Delayed items are always for dir index items. */
- ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY);
- if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM &&
- ins->key.offset >= delayed_node->index_cnt)
- delayed_node->index_cnt = ins->key.offset + 1;
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->index >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->index + 1;
delayed_node->count++;
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
@@ -451,21 +410,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
- /* Not associated with any delayed_node */
- if (!delayed_item->delayed_node)
+ /* Not inserted, ignore it. */
+ if (RB_EMPTY_NODE(&delayed_item->rb_node))
return;
+
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
- BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
- delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
- if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_item->delayed_node->ins_root;
else
root = &delayed_item->delayed_node->del_root;
rb_erase_cached(&delayed_item->rb_node, root);
+ RB_CLEAR_NODE(&delayed_item->rb_node);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -520,12 +479,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
u64 num_bytes;
int ret;
@@ -545,14 +503,14 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid,
+ item->delayed_node->inode_id,
num_bytes, 1);
/*
* For insertions we track reserved metadata space by accounting
* for the number of leaves that will be used, based on the delayed
* node's index_items_size field.
*/
- if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
+ if (item->type == BTRFS_DELAYED_DELETION_ITEM)
item->bytes_reserved = num_bytes;
}
@@ -574,8 +532,8 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
* to release/reserve qgroup space.
*/
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid, item->bytes_reserved,
- 0);
+ item->delayed_node->inode_id,
+ item->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
@@ -688,6 +646,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_item_batch batch;
+ struct btrfs_key first_key;
+ const u32 first_data_size = first_item->data_len;
int total_size;
char *ins_data = NULL;
int ret;
@@ -716,9 +676,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
ASSERT(first_item->bytes_reserved == 0);
list_add_tail(&first_item->tree_list, &item_list);
- batch.total_data_size = first_item->data_len;
+ batch.total_data_size = first_data_size;
batch.nr = 1;
- total_size = first_item->data_len + sizeof(struct btrfs_item);
+ total_size = first_data_size + sizeof(struct btrfs_item);
curr = first_item;
while (true) {
@@ -732,8 +692,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
* We cannot allow gaps in the key space if we're doing log
* replay.
*/
- if (continuous_keys_only &&
- (next->key.offset != curr->key.offset + 1))
+ if (continuous_keys_only && (next->index != curr->index + 1))
break;
ASSERT(next->bytes_reserved == 0);
@@ -750,8 +709,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
}
if (batch.nr == 1) {
- batch.keys = &first_item->key;
- batch.data_sizes = &first_item->data_len;
+ first_key.objectid = node->inode_id;
+ first_key.type = BTRFS_DIR_INDEX_KEY;
+ first_key.offset = first_item->index;
+ batch.keys = &first_key;
+ batch.data_sizes = &first_data_size;
} else {
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -768,7 +730,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
batch.keys = ins_keys;
batch.data_sizes = ins_sizes;
list_for_each_entry(curr, &item_list, tree_list) {
- ins_keys[i] = curr->key;
+ ins_keys[i].objectid = node->inode_id;
+ ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[i].offset = curr->index;
ins_sizes[i] = curr->data_len;
i++;
}
@@ -864,6 +828,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ const u64 ino = item->delayed_node->inode_id;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
struct extent_buffer *leaf = path->nodes[0];
@@ -902,7 +867,9 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
slot++;
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_comp_cpu_keys(&next->key, &key) != 0)
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
break;
nitems++;
curr = next;
@@ -920,9 +887,8 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
* Check btrfs_delayed_item_reserve_metadata() to see why we
* don't need to release/reserve qgroup space.
*/
- trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid, total_reserved_size,
- 0);
+ trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
+ total_reserved_size, 0);
btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
total_reserved_size, NULL);
}
@@ -940,8 +906,12 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
+ struct btrfs_key key;
int ret = 0;
+ key.objectid = node->inode_id;
+ key.type = BTRFS_DIR_INDEX_KEY;
+
while (ret == 0) {
struct btrfs_delayed_item *item;
@@ -952,7 +922,8 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
break;
}
- ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1);
+ key.offset = item->index;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
/*
* There's no matching item in the leaf. This means we
@@ -1457,16 +1428,15 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
- delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
+ delayed_node,
+ BTRFS_DELAYED_INSERTION_ITEM);
if (!delayed_item) {
ret = -ENOMEM;
goto release_node;
}
- delayed_item->key.objectid = btrfs_ino(dir);
- delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
- delayed_item->key.offset = index;
- delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM;
+ delayed_item->index = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
dir_item->location = *disk_key;
@@ -1490,8 +1460,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
}
if (reserve_leaf_space) {
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root,
- delayed_item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
/*
* Space was reserved for a dir index item insertion when we
* started the transaction, so getting a failure here should be
@@ -1538,12 +1507,12 @@ release_node:
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_node *node,
- struct btrfs_key *key)
+ u64 index)
{
struct btrfs_delayed_item *item;
mutex_lock(&node->mutex);
- item = __btrfs_lookup_delayed_insertion_item(node, key);
+ item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
return 1;
@@ -1589,32 +1558,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
- struct btrfs_key item_key;
int ret;
node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(node))
return PTR_ERR(node);
- item_key.objectid = btrfs_ino(dir);
- item_key.type = BTRFS_DIR_INDEX_KEY;
- item_key.offset = index;
-
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
- &item_key);
+ ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
if (!ret)
goto end;
- item = btrfs_alloc_delayed_item(0);
+ item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
ret = -ENOMEM;
goto end;
}
- item->key = item_key;
- item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM;
+ item->index = index;
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1743,9 +1705,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int ret = 0;
list_for_each_entry(curr, del_list, readdir_list) {
- if (curr->key.offset > index)
+ if (curr->index > index)
break;
- if (curr->key.offset == index) {
+ if (curr->index == index) {
ret = 1;
break;
}
@@ -1779,13 +1741,13 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (curr->key.offset < ctx->pos) {
+ if (curr->index < ctx->pos) {
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
- ctx->pos = curr->key.offset;
+ ctx->pos = curr->index;
di = (struct btrfs_dir_item *)curr->data;
name = (char *)(di + 1);
@@ -2085,3 +2047,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
}
}
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_insertion_item(node);
+ while (item) {
+ /*
+ * It's possible that the item is already in a log list. This
+ * can happen in case two tasks are trying to log the same
+ * directory. For example if we have tasks A and task B:
+ *
+ * Task A collected the delayed items into a log list while
+ * under the inode's log_mutex (at btrfs_log_inode()), but it
+ * only releases the items after logging the inodes they point
+ * to (if they are new inodes), which happens after unlocking
+ * the log mutex;
+ *
+ * Task B enters btrfs_log_inode() and acquires the log_mutex
+ * of the same directory inode, before task B releases the
+ * delayed items. This can happen for example when logging some
+ * inode we need to trigger logging of its parent directory, so
+ * logging two files that have the same parent directory can
+ * lead to this.
+ *
+ * If this happens, just ignore delayed items already in a log
+ * list. All the tasks logging the directory are under a log
+ * transaction and whichever finishes first can not sync the log
+ * before the other completes and leaves the log transaction.
+ */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, ins_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(node);
+ while (item) {
+ /* It may be non-empty, for the same reason mentioned above. */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, del_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
+
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_delayed_item *next;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+
+ list_for_each_entry_safe(item, next, ins_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ list_for_each_entry_safe(item, next, del_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 9795dc295a18..0163ca637a96 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,9 +16,10 @@
#include <linux/refcount.h>
#include "ctree.h"
-/* types of the delayed item */
-#define BTRFS_DELAYED_INSERTION_ITEM 1
-#define BTRFS_DELAYED_DELETION_ITEM 2
+enum btrfs_delayed_item_type {
+ BTRFS_DELAYED_INSERTION_ITEM,
+ BTRFS_DELAYED_DELETION_ITEM
+};
struct btrfs_delayed_root {
spinlock_t lock;
@@ -73,14 +74,27 @@ struct btrfs_delayed_node {
struct btrfs_delayed_item {
struct rb_node rb_node;
- struct btrfs_key key;
+ /* Offset value of the corresponding dir index key. */
+ u64 index;
struct list_head tree_list; /* used for batch insert/delete items */
struct list_head readdir_list; /* used for readdir items */
+ /*
+ * Used when logging a directory.
+ * Insertions and deletions to this list are protected by the parent
+ * delayed node's mutex.
+ */
+ struct list_head log_list;
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
refcount_t refs;
- int ins_or_del;
- u32 data_len;
+ enum btrfs_delayed_item_type type:8;
+ /*
+ * Track if this delayed item was already logged.
+ * Protected by the mutex of the parent delayed inode.
+ */
+ bool logged;
+ /* The maximum leaf size is 64K, so u16 is more than enough. */
+ u16 data_len;
char data[];
};
@@ -144,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
+/* Used during directory logging. */
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+
/* for init */
int __init btrfs_delayed_inode_init(void);
void __cold btrfs_delayed_inode_exit(void);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 41cddd3ff059..61e58066b5fd 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -545,10 +545,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (!cache)
continue;
- spin_lock(&cache->lock);
- cache->to_copy = 1;
- spin_unlock(&cache->lock);
-
+ set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
}
if (iter_ret < 0)
@@ -577,7 +574,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
return true;
spin_lock(&cache->lock);
- if (cache->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
return true;
}
@@ -610,9 +607,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
/* Last stripe on this device */
- spin_lock(&cache->lock);
- cache->to_copy = 0;
- spin_unlock(&cache->lock);
+ clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
return true;
}
@@ -1288,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
-{
- percpu_counter_inc(&fs_info->dev_replace.bio_counter);
-}
-
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 3911049a5f23..6084b313056a 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -7,6 +7,10 @@
#define BTRFS_DEV_REPLACE_H
struct btrfs_ioctl_dev_replace_args;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+struct btrfs_dev_replace;
+struct btrfs_block_group;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2633137c3e9f..a2da9313c694 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -131,8 +131,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
@@ -145,8 +144,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 1;
clear_extent_buffer_uptodate(eb);
out:
- unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -647,16 +646,14 @@ static void run_one_async_start(struct btrfs_work *work)
*/
static void run_one_async_done(struct btrfs_work *work)
{
- struct async_submit_bio *async;
- struct inode *inode;
-
- async = container_of(work, struct async_submit_bio, work);
- inode = async->inode;
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct inode *inode = async->inode;
+ struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
- async->bio->bi_status = async->status;
- bio_endio(async->bio);
+ btrfs_bio_end_io(bbio, async->status);
return;
}
@@ -757,6 +754,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
bio->bi_opf |= REQ_META;
@@ -776,8 +774,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
ret = btree_csum_one_bio(bio);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(bbio, ret);
return;
}
@@ -1524,6 +1521,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->block_group_root) ?
+ fs_info->block_group_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
struct btrfs_root *root = btrfs_global_root(fs_info, &key);
@@ -1980,14 +1980,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
- btrfs_set_backup_block_group_root(root_backup,
- info->block_group_root->node->start);
- btrfs_set_backup_block_group_root_gen(root_backup,
- btrfs_header_generation(info->block_group_root->node));
- btrfs_set_backup_block_group_root_level(root_backup,
- btrfs_header_level(info->block_group_root->node));
- } else {
+ if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
@@ -2225,6 +2218,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
struct inode *inode = fs_info->btree_inode;
+ unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+ fs_info->tree_root);
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
@@ -2238,8 +2233,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO, inode);
- BTRFS_I(inode)->io_tree.track_uptodate = false;
+ IO_TREE_BTREE_INODE_IO, NULL);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
@@ -2247,7 +2241,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
BTRFS_I(inode)->location.type = 0;
BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- btrfs_insert_inode_hash(inode);
+ __insert_inode_hash(inode, hash);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2266,6 +2260,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2529,10 +2524,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (ret)
return ret;
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->block_group_root = root;
+ }
+ }
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
@@ -2600,8 +2609,8 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-static int validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
@@ -2703,6 +2712,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ /*
+ * Artificial requirement for block-group-tree to force newer features
+ * (free-space-tree, no-holes) so the test matrix is smaller.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+ btrfs_err(fs_info,
+ "block-group-tree feature requires fres-space-tree and no-holes");
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2785,7 +2806,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
*/
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
- return validate_super(fs_info, fs_info->super_copy, 0);
+ return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
}
/*
@@ -2799,7 +2820,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = validate_super(fs_info, sb, -1);
+ ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
@@ -2860,17 +2881,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info, "couldn't read tree root");
return ret;
}
-
- if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
- return 0;
-
- bytenr = btrfs_super_block_group_root(sb);
- gen = btrfs_super_block_group_root_generation(sb);
- level = btrfs_super_block_group_root_level(sb);
- ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
- if (ret)
- btrfs_warn(fs_info, "couldn't read block group root");
- return ret;
+ return 0;
}
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
@@ -2882,16 +2893,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- struct btrfs_root *root;
-
- root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
- GFP_KERNEL);
- if (!root)
- return -ENOMEM;
- fs_info->block_group_root = root;
- }
-
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
@@ -2990,6 +2991,19 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+ btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+ BTRFS_LOCKDEP_TRANS_COMPLETED);
+
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -3279,6 +3293,112 @@ out:
return ret;
}
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify on-disk format like free space tree and
+ * screw up certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 incompat = btrfs_super_incompat_flags(disk_super);
+ const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+ const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+ if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ btrfs_err(fs_info,
+ "cannot mount because of unknown incompat features (0x%llx)",
+ incompat);
+ return -EINVAL;
+ }
+
+ /* Runtime limitation for mixed block groups. */
+ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (fs_info->sectorsize != fs_info->nodesize)) {
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ fs_info->nodesize, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* Mixed backref is an always-enabled feature. */
+ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+ /* Set compression related flags just in case. */
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+ /*
+ * An ancient flag, which should really be marked deprecated.
+ * Such runtime limitation doesn't really need a incompat flag.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+ if (compat_ro_unsupp && !sb_rdonly(sb)) {
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata writes, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * Artificial limitations for block group tree, to force
+ * block-group-tree to rely on no-holes and free-space-tree.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+ btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+ return -EINVAL;
+ }
+
+ /*
+ * Subpage runtime limitation on v1 cache.
+ *
+ * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * we're already defaulting to v2 cache, no need to bother v1 as it's
+ * going to be deprecated anyway.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* This can be called by remount, we need to protect the super block. */
+ spin_lock(&fs_info->super_lock);
+ btrfs_set_super_incompat_flags(disk_super, incompat);
+ spin_unlock(&fs_info->super_lock);
+
+ return 0;
+}
+
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -3428,72 +3548,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- features = btrfs_super_incompat_flags(disk_super) &
- ~BTRFS_FEATURE_INCOMPAT_SUPP;
- if (features) {
- btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
-
- features = btrfs_super_incompat_flags(disk_super);
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
-
- /*
- * Flag our filesystem as having big metadata blocks if they are bigger
- * than the page size.
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
-
- /*
- * mixed block groups end up with duplicate but slightly offset
- * extent buffers for the same range. It leads to corruptions
- */
- if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != nodesize)) {
- btrfs_err(fs_info,
-"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
- nodesize, sectorsize);
- goto fail_alloc;
- }
-
- /*
- * Needn't use the lock because there is no other task which will
- * update the flag.
- */
- btrfs_set_super_incompat_flags(disk_super, features);
-
- features = btrfs_super_compat_ro_flags(disk_super) &
- ~BTRFS_FEATURE_COMPAT_RO_SUPP;
- if (!sb_rdonly(sb) && features) {
- btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
- /*
- * We have unsupported RO compat features, although RO mounted, we
- * should not cause any metadata write, including log replay.
- * Or we could screw up whatever the new feature requires.
- */
- if (unlikely(features && btrfs_super_log_root(disk_super) &&
- !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
- btrfs_err(fs_info,
-"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
- features);
- err = -EINVAL;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0) {
+ err = ret;
goto fail_alloc;
}
-
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
@@ -3833,7 +3893,7 @@ static void btrfs_end_super_write(struct bio *bio)
}
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num)
+ int copy_num, bool drop_cache)
{
struct btrfs_super_block *super;
struct page *page;
@@ -3851,6 +3911,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
+
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping,
+ bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -3882,7 +3955,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i);
+ super = btrfs_read_dev_one_super(bdev, i, false);
if (IS_ERR(super))
continue;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 47ad8e0a2d33..c67c15d4d20b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,10 +46,13 @@ int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num);
+ int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_key *key);
@@ -103,7 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
new file mode 100644
index 000000000000..83cb0378096f
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.c
@@ -0,0 +1,1674 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include "ctree.h"
+#include "extent-io-tree.h"
+#include "btrfs_inode.h"
+#include "misc.h"
+
+static struct kmem_cache *extent_state_cache;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(states);
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline void btrfs_leak_debug_add_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
+ refcount_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct inode *inode = tree->private_data;
+ u64 isize;
+
+ if (!inode)
+ return;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add_state(state) do {} while (0)
+#define btrfs_leak_debug_del_state(state) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#endif
+
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data)
+{
+ tree->fs_info = fs_info;
+ tree->state = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ tree->private_data = private_data;
+ tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once extent_io_tree_release is
+ * called.
+ */
+ smp_mb();
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+
+ /*
+ * The given mask might be not appropriate for the slab allocator,
+ * drop the unsupported bits
+ */
+ mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ RB_CLEAR_NODE(&state->rb_node);
+ btrfs_leak_debug_add_state(state);
+ refcount_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
+ return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (refcount_dec_and_test(&state->refs)) {
+ WARN_ON(extent_state_in_tree(state));
+ btrfs_leak_debug_del_state(state);
+ trace_free_extent_state(state, _RET_IP_);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static int add_extent_changeset(struct extent_state *state, u32 bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return 0;
+ if (set && (state->state & bits) == bits)
+ return 0;
+ if (!set && (state->state & bits) == 0)
+ return 0;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(&changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ return ret;
+}
+
+static inline struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+static inline struct extent_state *prev_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_prev(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+/*
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @node_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
+ * containing @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return pointer to entry that ends before @offset
+ * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
+ */
+static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct extent_state *entry = NULL;
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ if (node_ret)
+ *node_ret = node;
+ if (parent_ret)
+ *parent_ret = prev;
+
+ /* Search neighbors until we find the first one past the end */
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+
+ return entry;
+}
+
+/*
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct extent_state **prev_ret,
+ struct extent_state **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct extent_state *orig_prev;
+ struct extent_state *entry = NULL;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ entry = rb_entry(*node, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ orig_prev = entry;
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+ *next_ret = entry;
+ entry = orig_prev;
+
+ while (entry && offset < entry->start)
+ entry = prev_state(entry);
+ *prev_ret = entry;
+
+ return NULL;
+}
+
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree->fs_info, err,
+ "locking error: extent tree was modified by another thread while locked");
+}
+
+/*
+ * Utility function to look for merge candidates inside a given range. Any
+ * extents with matching state are merged together into a single extent in the
+ * tree. Extents with EXTENT_IO in their state field are not merged because
+ * the end_io handlers need to be able to do operations on them without
+ * sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *other;
+
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ return;
+
+ other = prev_state(state);
+ if (other && other->end == state->start - 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
+ state->start = other->start;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ other = next_state(state);
+ if (other && other->start == state->end + 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data, state,
+ other);
+ state->end = other->end;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
+ state->state |= bits_to_set;
+}
+
+/*
+ * Insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ struct rb_node **node;
+ struct rb_node *parent;
+ const u64 end = state->end;
+
+ set_state_bits(tree, state, bits, changeset);
+
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
+ merge_state(tree, state);
+ return 0;
+}
+
+/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
+ * Split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
+
+ if (tree->private_data)
+ btrfs_split_delalloc_extent(tree->private_data, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
+ return 0;
+}
+
+/*
+ * Utility function to clear some bits in an extent state struct. It will
+ * optionally wake up anyone waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, int wake,
+ struct extent_changeset *changeset)
+{
+ struct extent_state *next;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (state->state == 0) {
+ next = next_state(state);
+ if (extent_state_in_tree(state)) {
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ next = next_state(state);
+ }
+ return next;
+}
+
+/*
+ * Clear some bits on a range in the tree. This may require splitting or
+ * inserting elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
+ * range from the tree regardless of state (ie for truncate).
+ *
+ * The range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ u64 last_end;
+ int err;
+ int clear = 0;
+ int wake;
+ int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
+ wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+ if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ clear = 1;
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
+ if (clear)
+ refcount_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+
+ /* This search will find the extents that end after our range starts. */
+ state = tree_search(tree, start);
+ if (!state)
+ goto out;
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /* The state doesn't have the wanted bits, go ahead. */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we clear the desired bit
+ * on it.
+ */
+
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+ goto next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ if (wake)
+ wake_up(&state->wq);
+
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+next:
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && state && !need_resched())
+ goto hit_next;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
+}
+
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+}
+
+/*
+ * Wait for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
+{
+ struct extent_state *state;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * This search will find all the extents that end after our
+ * range starts.
+ */
+ state = tree_search(tree, start);
+process_node:
+ if (!state)
+ break;
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ refcount_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (!cond_resched_lock(&tree->lock)) {
+ state = next_state(state);
+ goto process_node;
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ unsigned flags)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (!flags || (state->state & flags)) {
+ *cached_ptr = state;
+ refcount_inc(&state->refs);
+ }
+ }
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+}
+
+/*
+ * Find the first state struct with 'bits' set after 'start', and return it.
+ * tree->lock must be held. NULL will returned if nothing was found after
+ * 'start'.
+ */
+static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, u32 bits)
+{
+ struct extent_state *state;
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, start);
+ while (state) {
+ if (state->end >= start && (state->state & bits))
+ return state;
+ state = next_state(state);
+ }
+ return NULL;
+}
+
+/*
+ * Find the first offset in the io tree with one or more @bits set.
+ *
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
+ while ((state = next_state(state)) != NULL) {
+ if (state->state & bits)
+ goto got_it;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state_if_flags(state, cached_state, 0);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous range of bytes in the file marked as delalloc, not more
+ * than 'max_bytes'. start and end are used to return the range,
+ *
+ * True is returned if we find something, false if nothing was in the tree.
+ */
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ bool found = false;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ if (!state) {
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (state) {
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found) {
+ *start = state->start;
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ found = true;
+ *end = state->end;
+ cur_start = state->end + 1;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+/*
+ * Set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive This takes the tree lock.
+ */
+static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ u32 exclusive_bits = (bits & EXTENT_LOCKED);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, changeset);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
+ NULL, mask);
+}
+
+/*
+ * Convert all bits in a given range from one bit to another
+ *
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie.
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ bool first_iteration = true;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+ clear_bits);
+
+again:
+ if (!prealloc) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
+ prealloc = alloc_extent_state(GFP_NOFS);
+ if (!prealloc && !first_iteration)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going.
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, NULL);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+}
+
+/*
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
+ *
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
+ *
+ * Since unallocated range is also considered one which doesn't have the bits
+ * set it's possible that @end_ret contains -1, this happens in case the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ struct extent_state *prev = NULL, *next;
+
+ spin_lock(&tree->lock);
+
+ /* Find first extent with bits cleared */
+ while (1) {
+ state = tree_search_prev_next(tree, start, &prev, &next);
+ if (!state && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!state && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ *start_ret = prev->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!state) {
+ state = next;
+ }
+
+ /*
+ * At this point 'state' either contains 'start' or start is
+ * before 'state'
+ */
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
+ /*
+ * |--range with bits sets--|
+ * |
+ * start
+ */
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as the
+ * beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
+ }
+ } else {
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev)
+ *start_ret = prev->end + 1;
+ else
+ *start_ret = 0;
+ break;
+ }
+ }
+
+ /*
+ * Find the longest stretch from start until an entry which has the
+ * bits set
+ */
+ while (state) {
+ if (state->end >= start && !(state->state & bits)) {
+ *end_ret = state->end;
+ } else {
+ *end_ret = state->start - 1;
+ break;
+ }
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+/*
+ * Count the number of bytes in the tree that have a given bit(s) set. This
+ * can be fairly slow, except for EXTENT_DIRTY which is cached. The total
+ * number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, int contig)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ u64 last = 0;
+ int found = 0;
+
+ if (WARN_ON(search_end <= cur_start))
+ return 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ while (state) {
+ if (state->start > search_end)
+ break;
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = max(cur_start, state->start);
+ found = 1;
+ }
+ last = state->end;
+ } else if (contig && found) {
+ break;
+ }
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
+
+/*
+ * Searche a range in the state tree for a given mask. If 'filled' == 1, this
+ * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
+ * is returned if any bit in the range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+ cached->end > start)
+ state = cached;
+ else
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ state = next_state(state);
+ }
+
+ /* We ran out of states and were still inside of our range. */
+ if (filled && !state)
+ bitset = 0;
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/* Wrappers around set/clear extent bit */
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCKED case, it will
+ * either fail with -EEXIST or changeset will record the whole
+ * range.
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
+ GFP_NOFS);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * Don't support EXTENT_LOCKED case, same reason as
+ * set_record_extent_bits().
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
+ changeset);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ NULL, NULL, GFP_NOFS);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, NULL);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ cached_state, NULL, GFP_NOFS);
+ while (err == -EEXIST) {
+ if (failed_start != start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, cached_state);
+
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ &failed_start, cached_state, NULL,
+ GFP_NOFS);
+ }
+ return err;
+}
+
+void __cold extent_state_free_cachep(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
+int __init extent_state_init_cachep(void)
+{
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index c3eb52dbe61c..a855f40dd61d 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -17,7 +17,6 @@ struct io_failure_record;
#define EXTENT_NODATASUM (1U << 7)
#define EXTENT_CLEAR_META_RESV (1U << 8)
#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_DAMAGED (1U << 10)
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
@@ -35,10 +34,18 @@ struct io_failure_record;
* delalloc bytes decremented, in an atomic way to prevent races with stat(2).
*/
#define EXTENT_ADD_INODE_BYTES (1U << 15)
+
+/*
+ * Set during truncate when we're clearing an entire range and we just want the
+ * extent states to go away.
+ */
+#define EXTENT_CLEAR_ALL_BITS (1U << 16)
+
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
- EXTENT_ADD_INODE_BYTES)
+ EXTENT_ADD_INODE_BYTES | \
+ EXTENT_CLEAR_ALL_BITS)
/*
* Redefined bits above which are used only in the device allocation tree,
@@ -56,7 +63,6 @@ enum {
IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
- IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
@@ -70,8 +76,6 @@ struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
void *private_data;
- u64 dirty_bytes;
- bool track_uptodate;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -89,33 +93,23 @@ struct extent_state {
refcount_t refs;
u32 state;
- struct io_failure_record *failrec;
-
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
-int __init extent_state_cache_init(void);
-void __cold extent_state_cache_exit(void);
-
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data);
void extent_io_tree_release(struct extent_io_tree *tree);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
+int __init extent_state_init_cachep(void);
+void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
@@ -126,72 +120,66 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
+ u32 bits, struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+ return __clear_extent_bit(tree, start, end, bits, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
+static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+ return clear_extent_bit(tree, start, end, bits, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, unsigned exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits);
+ u32 bits, struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
+}
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- NULL);
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
- mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
+ EXTENT_DO_ACCOUNTING, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -211,30 +199,29 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | extra_bits,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
- GFP_NOFS, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- cached_state, mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, mask);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -244,24 +231,9 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidate_folio(struct extent_io_tree *tree,
- struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-
-/* This should be reworked in the future and put elsewhere. */
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec);
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6914cd8024ba..cd2d36580f1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2220,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
+ if (path->nowait) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
+ return -EAGAIN;
+ }
+
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
@@ -2686,13 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- down_read(&fs_info->commit_root_sem);
- if (start < cache->last_byte_to_unpin && return_free_space) {
- u64 add_len = min(len, cache->last_byte_to_unpin - start);
-
- btrfs_add_free_space(cache, start, add_len);
- }
- up_read(&fs_info->commit_root_sem);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
start += len;
total_unpinned += len;
@@ -3804,7 +3805,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
- if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
+ if (block_group->ro ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
ret = 1;
goto out;
}
@@ -3881,7 +3883,7 @@ out:
* regular extents) at the same time to the same zone, which
* easily break the write pointer.
*/
- block_group->zoned_data_reloc_ongoing = 1;
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
fs_info->data_reloc_bg = 0;
}
spin_unlock(&fs_info->relocation_bg_lock);
@@ -4888,6 +4890,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
!test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+ /* btrfs_clean_tree_block() accesses generation field. */
+ btrfs_set_header_generation(buf, trans->transid);
+
/*
* This needs to stay, because we could allocate a freed block from an
* old tree into a new tree, so we need to make sure this new block is
@@ -5639,6 +5644,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
+ const bool is_reloc_root = (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5798,6 +5805,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
@@ -5832,7 +5842,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
@@ -5864,6 +5874,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_put_root(root);
root_dropped = true;
out_end_trans:
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cf4f19e80e2f..4dcf22e051ff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -31,38 +31,27 @@
#include "block-group.h"
#include "compression.h"
-static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
-static struct bio_set btrfs_bioset;
-
-static inline bool extent_state_in_tree(const struct extent_state *state)
-{
- return !RB_EMPTY_NODE(&state->rb_node);
-}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(states);
-static DEFINE_SPINLOCK(leak_lock);
-
-static inline void btrfs_leak_debug_add(spinlock_t *lock,
- struct list_head *new,
- struct list_head *head)
+static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_add(new, head);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_add(&eb->leak_list, &fs_info->allocated_ebs);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-static inline void btrfs_leak_debug_del(spinlock_t *lock,
- struct list_head *entry)
+static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_del(entry);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
@@ -91,53 +80,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-
-static inline void btrfs_extent_state_leak_debug_check(void)
-{
- struct extent_state *state;
-
- while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
- state->start, state->end, state->state,
- extent_state_in_tree(state),
- refcount_read(&state->refs));
- list_del(&state->leak_list);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-#define btrfs_debug_check_extent_io_range(tree, start, end) \
- __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
-static inline void __btrfs_debug_check_extent_io_range(const char *caller,
- struct extent_io_tree *tree, u64 start, u64 end)
-{
- struct inode *inode = tree->private_data;
- u64 isize;
-
- if (!inode || !is_data_inode(inode))
- return;
-
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
-}
#else
-#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
-#define btrfs_leak_debug_del(lock, entry) do {} while (0)
-#define btrfs_extent_state_leak_debug_check() do {} while (0)
-#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#define btrfs_leak_debug_add_eb(eb) do {} while (0)
+#define btrfs_leak_debug_del_eb(eb) do {} while (0)
#endif
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
-
/*
* Structure to record info about the bio being assembled, and other info like
* how many bytes are there before stripe/ordered extent boundary.
@@ -148,6 +95,7 @@ struct btrfs_bio_ctrl {
enum btrfs_compression_type compress_type;
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
+ btrfs_bio_end_io_t end_io_func;
};
struct extent_page_data {
@@ -161,24 +109,6 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, u32 bits,
- struct extent_changeset *changeset,
- int set)
-{
- int ret;
-
- if (!changeset)
- return 0;
- if (set && (state->state & bits) == bits)
- return 0;
- if (!set && (state->state & bits) == 0)
- return 0;
- changeset->bytes_changed += state->end - state->start + 1;
- ret = ulist_add(&changeset->range_changed, state->start, state->end,
- GFP_ATOMIC);
- return ret;
-}
-
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct bio *bio;
@@ -207,7 +137,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
btrfs_submit_data_read_bio(inode, bio, mirror_num,
bio_ctrl->compress_type);
- /* The bio is owned by the bi_end_io handler now */
+ /* The bio is owned by the end_io handler now */
bio_ctrl->bio = NULL;
}
@@ -223,26 +153,15 @@ static void submit_write_bio(struct extent_page_data *epd, int ret)
if (ret) {
ASSERT(ret < 0);
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- /* The bio is owned by the bi_end_io handler now */
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ /* The bio is owned by the end_io handler now */
epd->bio_ctrl.bio = NULL;
} else {
submit_one_bio(&epd->bio_ctrl);
}
}
-int __init extent_state_cache_init(void)
-{
- extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!extent_state_cache)
- return -ENOMEM;
- return 0;
-}
-
-int __init extent_io_init(void)
+int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
@@ -250,32 +169,10 @@ int __init extent_io_init(void)
if (!extent_buffer_cache)
return -ENOMEM;
- if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_bio, bio),
- BIOSET_NEED_BVECS))
- goto free_buffer_cache;
-
- if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
- goto free_bioset;
-
return 0;
-
-free_bioset:
- bioset_exit(&btrfs_bioset);
-
-free_buffer_cache:
- kmem_cache_destroy(extent_buffer_cache);
- extent_buffer_cache = NULL;
- return -ENOMEM;
-}
-
-void __cold extent_state_cache_exit(void)
-{
- btrfs_extent_state_leak_debug_check();
- kmem_cache_destroy(extent_state_cache);
}
-void __cold extent_io_exit(void)
+void __cold extent_buffer_free_cachep(void)
{
/*
* Make sure all delayed rcu free are flushed before we
@@ -283,1244 +180,6 @@ void __cold extent_io_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(extent_buffer_cache);
- bioset_exit(&btrfs_bioset);
-}
-
-/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
- */
-static struct lock_class_key file_extent_tree_class;
-
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data)
-{
- tree->fs_info = fs_info;
- tree->state = RB_ROOT;
- tree->dirty_bytes = 0;
- spin_lock_init(&tree->lock);
- tree->private_data = private_data;
- tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
-}
-
-void extent_io_tree_release(struct extent_io_tree *tree)
-{
- spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- /*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
- */
- ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
-
- cond_resched_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
-}
-
-static struct extent_state *alloc_extent_state(gfp_t mask)
-{
- struct extent_state *state;
-
- /*
- * The given mask might be not appropriate for the slab allocator,
- * drop the unsupported bits
- */
- mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
- state = kmem_cache_alloc(extent_state_cache, mask);
- if (!state)
- return state;
- state->state = 0;
- state->failrec = NULL;
- RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
- refcount_set(&state->refs, 1);
- init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
- return state;
-}
-
-void free_extent_state(struct extent_state *state)
-{
- if (!state)
- return;
- if (refcount_dec_and_test(&state->refs)) {
- WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&leak_lock, &state->leak_list);
- trace_free_extent_state(state, _RET_IP_);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-/**
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
- *
- * @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
- * @node_ret: pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret: points to entry which would have been the parent of the entry,
- * containing @offset
- *
- * Return a pointer to the entry that contains @offset byte address and don't change
- * @node_ret and @parent_ret.
- *
- * If no such entry exists, return pointer to entry that ends before @offset
- * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
- */
-static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node ***node_ret,
- struct rb_node **parent_ret)
-{
- struct rb_root *root = &tree->state;
- struct rb_node **node = &root->rb_node;
- struct rb_node *prev = NULL;
- struct tree_entry *entry;
-
- while (*node) {
- prev = *node;
- entry = rb_entry(prev, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- node = &(*node)->rb_left;
- else if (offset > entry->end)
- node = &(*node)->rb_right;
- else
- return *node;
- }
-
- if (node_ret)
- *node_ret = node;
- if (parent_ret)
- *parent_ret = prev;
-
- /* Search neighbors until we find the first one past the end */
- while (prev && offset > entry->end) {
- prev = rb_next(prev);
- entry = rb_entry(prev, struct tree_entry, rb_node);
- }
-
- return prev;
-}
-
-/*
- * Inexact rb-tree search, return the next entry if @offset is not found
- */
-static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
-{
- return tree_search_for_insert(tree, offset, NULL, NULL);
-}
-
-/**
- * Search offset in the tree or fill neighbor rbtree node pointers.
- *
- * @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
- * @next_ret: pointer to the first entry whose range ends after @offset
- * @prev_ret: pointer to the first entry whose range begins before @offset
- *
- * Return a pointer to the entry that contains @offset byte address. If no
- * such entry exists, then return NULL and fill @prev_ret and @next_ret.
- * Otherwise return the found entry and other pointers are left untouched.
- */
-static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
-{
- struct rb_root *root = &tree->state;
- struct rb_node **node = &root->rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *orig_prev = NULL;
- struct tree_entry *entry;
-
- ASSERT(prev_ret);
- ASSERT(next_ret);
-
- while (*node) {
- prev = *node;
- entry = rb_entry(prev, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- node = &(*node)->rb_left;
- else if (offset > entry->end)
- node = &(*node)->rb_right;
- else
- return *node;
- }
-
- orig_prev = prev;
- while (prev && offset > entry->end) {
- prev = rb_next(prev);
- entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *next_ret = prev;
- prev = orig_prev;
-
- entry = rb_entry(prev, struct tree_entry, rb_node);
- while (prev && offset < entry->start) {
- prev = rb_prev(prev);
- entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *prev_ret = prev;
-
- return NULL;
-}
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree. Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static void merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
-{
- struct extent_state *other;
- struct rb_node *other_node;
-
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- return;
-
- other_node = rb_prev(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
- other_node = rb_next(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, u32 bits,
- struct extent_changeset *changeset);
-
-/*
- * insert an extent_state struct into the tree. 'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally. This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
-{
- struct rb_node **node;
- struct rb_node *parent;
- const u64 end = state->end;
-
- set_state_bits(tree, state, bits, changeset);
-
- node = &tree->state.rb_node;
- while (*node) {
- struct tree_entry *entry;
-
- parent = *node;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (end < entry->start) {
- node = &(*node)->rb_left;
- } else if (end > entry->end) {
- node = &(*node)->rb_right;
- } else {
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, end);
- return -EEXIST;
- }
- }
-
- rb_link_node(&state->rb_node, parent, node);
- rb_insert_color(&state->rb_node, &tree->state);
-
- merge_state(tree, state);
- return 0;
-}
-
-/*
- * Insert state to @tree to the location given by @node and @parent.
- */
-static void insert_state_fast(struct extent_io_tree *tree,
- struct extent_state *state, struct rb_node **node,
- struct rb_node *parent, unsigned bits,
- struct extent_changeset *changeset)
-{
- set_state_bits(tree, state, bits, changeset);
- rb_link_node(&state->rb_node, parent, node);
- rb_insert_color(&state->rb_node, &tree->state);
- merge_state(tree, state);
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half. 'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling,
- * the tree has 'orig' at [orig->start, orig->end]. After calling, there
- * are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
- struct extent_state *prealloc, u64 split)
-{
- struct rb_node *parent = NULL;
- struct rb_node **node;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_split_delalloc_extent(tree->private_data, orig, split);
-
- prealloc->start = orig->start;
- prealloc->end = split - 1;
- prealloc->state = orig->state;
- orig->start = split;
-
- parent = &orig->rb_node;
- node = &parent;
- while (*node) {
- struct tree_entry *entry;
-
- parent = *node;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (prealloc->end < entry->start) {
- node = &(*node)->rb_left;
- } else if (prealloc->end > entry->end) {
- node = &(*node)->rb_right;
- } else {
- free_extent_state(prealloc);
- return -EEXIST;
- }
- }
-
- rb_link_node(&prealloc->rb_node, parent, node);
- rb_insert_color(&prealloc->rb_node, &tree->state);
-
- return 0;
-}
-
-static struct extent_state *next_state(struct extent_state *state)
-{
- struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up anyone waiting on this state (wake == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, int wake,
- struct extent_changeset *changeset)
-{
- struct extent_state *next;
- u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
- int ret;
-
- if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- WARN_ON(range > tree->dirty_bytes);
- tree->dirty_bytes -= range;
- }
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_clear_delalloc_extent(tree->private_data, state, bits);
-
- ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
- BUG_ON(ret < 0);
- state->state &= ~bits_to_clear;
- if (wake)
- wake_up(&state->wq);
- if (state->state == 0) {
- next = next_state(state);
- if (extent_state_in_tree(state)) {
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
- } else {
- WARN_ON(1);
- }
- } else {
- merge_state(tree, state);
- next = next_state(state);
- }
- return next;
-}
-
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
-{
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
-
- return prealloc;
-}
-
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
-{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
-}
-
-/*
- * clear some bits on a range in the tree. This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (ie for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns 0 on success and < 0 on error.
- */
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *cached;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- u64 last_end;
- int err;
- int clear = 0;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
-
- if (bits & EXTENT_DELALLOC)
- bits |= EXTENT_NORESERVE;
-
- if (delete)
- bits |= ~EXTENT_CTLBITS;
-
- if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- clear = 1;
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover too any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state) {
- cached = *cached_state;
-
- if (clear) {
- *cached_state = NULL;
- cached_state = NULL;
- }
-
- if (cached && extent_state_in_tree(cached) &&
- cached->start <= start && cached->end > start) {
- if (clear)
- refcount_dec(&cached->refs);
- state = cached;
- goto hit_next;
- }
- if (clear)
- free_extent_state(cached);
- }
- /*
- * this search will find the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- if (state->start > end)
- goto out;
- WARN_ON(state->end < start);
- last_end = state->end;
-
- /* the state doesn't have the wanted bits, go ahead */
- if (!(state->state & bits)) {
- state = next_state(state);
- goto next;
- }
-
- /*
- * | ---- desired range ---- |
- * | state | or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip
- * bits on second half.
- *
- * If the extent we found extends past our range, we
- * just split and search again. It'll get split again
- * the next time though.
- *
- * If the extent we found is inside our range, we clear
- * the desired bit on it.
- */
-
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- state = clear_state_bit(tree, state, bits, wake, changeset);
- goto next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and clear the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- if (wake)
- wake_up(&state->wq);
-
- clear_state_bit(tree, prealloc, bits, wake, changeset);
-
- prealloc = NULL;
- goto out;
- }
-
- state = clear_state_bit(tree, state, bits, wake, changeset);
-next:
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start <= end && state && !need_resched())
- goto hit_next;
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return 0;
-
-}
-
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
-/*
- * waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function
- */
-static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits)
-{
- struct extent_state *state;
- struct rb_node *node;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
-
- spin_lock(&tree->lock);
-again:
- while (1) {
- /*
- * this search will find all the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
-process_node:
- if (!node)
- break;
-
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (state->start > end)
- goto out;
-
- if (state->state & bits) {
- start = state->start;
- refcount_inc(&state->refs);
- wait_on_state(tree, state);
- free_extent_state(state);
- goto again;
- }
- start = state->end + 1;
-
- if (start > end)
- break;
-
- if (!cond_resched_lock(&tree->lock)) {
- node = rb_next(node);
- goto process_node;
- }
- }
-out:
- spin_unlock(&tree->lock);
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
-{
- u32 bits_to_set = bits & ~EXTENT_CTLBITS;
- int ret;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_set_delalloc_extent(tree->private_data, state, bits);
-
- if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- tree->dirty_bytes += range;
- }
- ret = add_extent_changeset(state, bits_to_set, changeset, 1);
- BUG_ON(ret < 0);
- state->state |= bits_to_set;
-}
-
-static void cache_state_if_flags(struct extent_state *state,
- struct extent_state **cached_ptr,
- unsigned flags)
-{
- if (cached_ptr && !(*cached_ptr)) {
- if (!flags || (state->state & flags)) {
- *cached_ptr = state;
- refcount_inc(&state->refs);
- }
- }
-}
-
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
-{
- return cache_state_if_flags(state, cached_ptr,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
-}
-
-/*
- * set some bits on a range in the tree. This may require allocations or
- * sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If any of the exclusive bits are set, this will fail with -EEXIST if some
- * part of the range already has the desired bits set. The start of the
- * existing range is returned in failed_start in this case.
- *
- * [start, end] is inclusive This takes the tree lock.
- */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- u32 exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
-
- if (exclusive_bits)
- ASSERT(failed_start);
- else
- ASSERT(failed_start == NULL);
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover too any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- prealloc->start = start;
- prealloc->end = end;
- insert_state_fast(tree, prealloc, p, parent, bits, changeset);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- if (state->state & exclusive_bits) {
- *failed_start = state->start;
- err = -EEXIST;
- goto out;
- }
-
- set_state_bits(tree, state, bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- /*
- * If this extent already has all the bits we want set, then
- * skip it, not necessary to split it or do anything with it.
- */
- if ((state->state & bits) == bits) {
- start = state->end + 1;
- cache_state(state, cached_state);
- goto search_again;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
-
- /*
- * Avoid to free 'prealloc' if it can be merged with
- * the later extent.
- */
- prealloc->start = start;
- prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, bits, changeset);
- cache_state(prealloc, cached_state);
- merge_state(tree, prealloc);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-
-}
-
-/**
- * convert_extent_bit - convert all bits in a given range from one bit to
- * another
- * @tree: the io tree to search
- * @start: the start offset in bytes
- * @end: the end offset in bytes (inclusive)
- * @bits: the bits to set in this range
- * @clear_bits: the bits to clear in this range
- * @cached_state: state that we're going to cache
- *
- * This will go through and set bits for the given range. If any states exist
- * already in this range they are set with the given bit and cleared of the
- * clear_bits. This is only meant to be used by things that are mergeable, ie
- * converting from say DELALLOC to DIRTY. This is not meant to be used with
- * boundary bits like LOCK.
- *
- * All allocations are done with GFP_NOFS.
- */
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
- bool first_iteration = true;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
- clear_bits);
-
-again:
- if (!prealloc) {
- /*
- * Best effort, don't worry if extent state allocation fails
- * here for the first iteration. We might have a cached state
- * that matches exactly the target range, in which case no
- * extent state allocations are needed. We'll only know this
- * after locking the tree.
- */
- prealloc = alloc_extent_state(GFP_NOFS);
- if (!prealloc && !first_iteration)
- return -ENOMEM;
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- prealloc->start = start;
- prealloc->end = end;
- insert_state_fast(tree, prealloc, p, parent, bits, NULL);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- /*
- * Avoid to free 'prealloc' if it can be merged with
- * the later extent.
- */
- prealloc->start = start;
- prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, bits, NULL);
- cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- cond_resched();
- first_iteration = false;
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-}
-
-/* wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
-{
- /*
- * We don't support EXTENT_LOCKED yet, as current changeset will
- * record any bits changed, so for EXTENT_LOCKED case, it will
- * either fail with -EEXIST or changeset will record the whole
- * range.
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
-}
-
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits)
-{
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
-}
-
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, bits, wake, delete,
- cached, GFP_NOFS, NULL);
-}
-
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
-{
- /*
- * Don't support EXTENT_LOCKED case, same reason as
- * set_record_extent_bits().
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
- changeset);
-}
-
-/*
- * either insert or lock state struct between start and end use mask to tell
- * us if waiting is desired.
- */
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state)
-{
- int err;
- u64 failed_start;
-
- while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
- start = failed_start;
- } else
- break;
- WARN_ON(start > end);
- }
- return err;
-}
-
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- int err;
- u64 failed_start;
-
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL);
- return 0;
- }
- return 1;
}
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
@@ -1554,295 +213,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
}
}
-/* find the first state struct with 'bits' set after 'start', and
- * return it. tree->lock must be held. NULL will returned if
- * nothing was found after 'start'
- */
-static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
-{
- struct rb_node *node;
- struct extent_state *state;
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits))
- return state;
-
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- return NULL;
-}
-
-/*
- * Find the first offset in the io tree with one or more @bits set.
- *
- * Note: If there are multiple bits set in @bits, any of them will match.
- *
- * Return 0 if we find something, and update @start_ret and @end_ret.
- * Return 1 if we found nothing.
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->end == start - 1 && extent_state_in_tree(state)) {
- while ((state = next_state(state)) != NULL) {
- if (state->state & bits)
- goto got_it;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
- goto out;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
- }
-
- state = find_first_extent_bit_state(tree, start, bits);
-got_it:
- if (state) {
- cache_state_if_flags(state, cached_state, 0);
- *start_ret = state->start;
- *end_ret = state->end;
- ret = 0;
- }
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
-
-/**
- * Find a contiguous area of bits
- *
- * @tree: io tree to check
- * @start: offset to start the search from
- * @start_ret: the first offset we found with the bits set
- * @end_ret: the final contiguous range of the bits that were set
- * @bits: bits to look for
- *
- * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
- * to set bits appropriately, and then merge them again. During this time it
- * will drop the tree->lock, so use this helper if you want to find the actual
- * contiguous area for given bits. We will search to the first bit we find, and
- * then walk down the tree until we find a non-contiguous area. The area
- * returned will be the full contiguous area with the bits set.
- */
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
-{
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- state = find_first_extent_bit_state(tree, start, bits);
- if (state) {
- *start_ret = state->start;
- *end_ret = state->end;
- while ((state = next_state(state)) != NULL) {
- if (state->start > (*end_ret + 1))
- break;
- *end_ret = state->end;
- }
- ret = 0;
- }
- spin_unlock(&tree->lock);
- return ret;
-}
-
-/**
- * Find the first range that has @bits not set. This range could start before
- * @start.
- *
- * @tree: the tree to search
- * @start: offset at/after which the found extent should start
- * @start_ret: records the beginning of the range
- * @end_ret: records the end of the range (inclusive)
- * @bits: the set of bits which must be unset
- *
- * Since unallocated range is also considered one which doesn't have the bits
- * set it's possible that @end_ret contains -1, this happens in case the range
- * spans (last_range_end, end of device]. In this case it's up to the caller to
- * trim @end_ret to the appropriate size.
- */
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
-{
- struct extent_state *state;
- struct rb_node *node, *prev = NULL, *next;
-
- spin_lock(&tree->lock);
-
- /* Find first extent with bits cleared */
- while (1) {
- node = tree_search_prev_next(tree, start, &prev, &next);
- if (!node && !next && !prev) {
- /*
- * Tree is completely empty, send full range and let
- * caller deal with it
- */
- *start_ret = 0;
- *end_ret = -1;
- goto out;
- } else if (!node && !next) {
- /*
- * We are past the last allocated chunk, set start at
- * the end of the last extent.
- */
- state = rb_entry(prev, struct extent_state, rb_node);
- *start_ret = state->end + 1;
- *end_ret = -1;
- goto out;
- } else if (!node) {
- node = next;
- }
- /*
- * At this point 'node' either contains 'start' or start is
- * before 'node'
- */
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (in_range(start, state->start, state->end - state->start + 1)) {
- if (state->state & bits) {
- /*
- * |--range with bits sets--|
- * |
- * start
- */
- start = state->end + 1;
- } else {
- /*
- * 'start' falls within a range that doesn't
- * have the bits set, so take its start as
- * the beginning of the desired range
- *
- * |--range with bits cleared----|
- * |
- * start
- */
- *start_ret = state->start;
- break;
- }
- } else {
- /*
- * |---prev range---|---hole/unset---|---node range---|
- * |
- * start
- *
- * or
- *
- * |---hole/unset--||--first node--|
- * 0 |
- * start
- */
- if (prev) {
- state = rb_entry(prev, struct extent_state,
- rb_node);
- *start_ret = state->end + 1;
- } else {
- *start_ret = 0;
- }
- break;
- }
- }
-
- /*
- * Find the longest stretch from start until an entry which has the
- * bits set
- */
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && !(state->state & bits)) {
- *end_ret = state->end;
- } else {
- *end_ret = state->start - 1;
- break;
- }
-
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
-}
-
-/*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'. start and end are used to return the range,
- *
- * true is returned if we find something, false if nothing was in the tree
- */
-bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
- u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
-{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- bool found = false;
- u64 total_bytes = 0;
-
- spin_lock(&tree->lock);
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node) {
- *end = (u64)-1;
- goto out;
- }
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (found && (state->start != cur_start ||
- (state->state & EXTENT_BOUNDARY))) {
- goto out;
- }
- if (!(state->state & EXTENT_DELALLOC)) {
- if (!found)
- *end = state->end;
- goto out;
- }
- if (!found) {
- *start = state->start;
- *cached_state = state;
- refcount_inc(&state->refs);
- }
- found = true;
- *end = state->end;
- cur_start = state->end + 1;
- node = rb_next(node);
- total_bytes += state->end - state->start + 1;
- if (total_bytes >= max_bytes)
- break;
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return found;
-}
-
/*
* Process one page for __process_pages_contig().
*
@@ -1900,9 +270,8 @@ static int __process_pages_contig(struct address_space *mapping,
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
pgoff_t index = start_index;
- unsigned long nr_pages = end_index - start_index + 1;
unsigned long pages_processed = 0;
- struct page *pages[16];
+ struct folio_batch fbatch;
int err = 0;
int i;
@@ -1911,16 +280,17 @@ static int __process_pages_contig(struct address_space *mapping,
ASSERT(processed_end && *processed_end == start);
}
- if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
mapping_set_error(mapping, -EIO);
- while (nr_pages > 0) {
- int found_pages;
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ int found_folios;
+
+ found_folios = filemap_get_folios_contig(mapping, &index,
+ end_index, &fbatch);
- found_pages = find_get_pages_contig(mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (found_pages == 0) {
+ if (found_folios == 0) {
/*
* Only if we're going to lock these pages, we can find
* nothing at @index.
@@ -1930,23 +300,20 @@ static int __process_pages_contig(struct address_space *mapping,
goto out;
}
- for (i = 0; i < found_pages; i++) {
+ for (i = 0; i < found_folios; i++) {
int process_ret;
-
+ struct folio *folio = fbatch.folios[i];
process_ret = process_one_page(fs_info, mapping,
- pages[i], locked_page, page_ops,
+ &folio->page, locked_page, page_ops,
start, end);
if (process_ret < 0) {
- for (; i < found_pages; i++)
- put_page(pages[i]);
err = -EAGAIN;
+ folio_batch_release(&fbatch);
goto out;
}
- put_page(pages[i]);
- pages_processed++;
+ pages_processed += folio_nr_pages(folio);
}
- nr_pages -= found_pages;
- index += found_pages;
+ folio_batch_release(&fbatch);
cond_resched();
}
out:
@@ -2094,14 +461,14 @@ again:
}
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
+ lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
- unlock_extent_cached(tree, delalloc_start, delalloc_end,
- &cached_state);
+ unlock_extent(tree, delalloc_start, delalloc_end,
+ &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
@@ -2118,210 +485,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops, NULL);
}
-/*
- * count the number of bytes in the tree that have a given bit(s)
- * set. This can be fairly slow, except for EXTENT_DIRTY which is
- * cached. The total number found is returned.
- */
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig)
+static int insert_failrec(struct btrfs_inode *inode,
+ struct io_failure_record *failrec)
{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- u64 total_bytes = 0;
- u64 last = 0;
- int found = 0;
-
- if (WARN_ON(search_end <= cur_start))
- return 0;
+ struct rb_node *exist;
- spin_lock(&tree->lock);
- if (cur_start == 0 && bits == EXTENT_DIRTY) {
- total_bytes = tree->dirty_bytes;
- goto out;
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node)
- goto out;
+ spin_lock(&inode->io_failure_lock);
+ exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
+ &failrec->rb_node);
+ spin_unlock(&inode->io_failure_lock);
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start > search_end)
- break;
- if (contig && found && state->start > last + 1)
- break;
- if (state->end >= cur_start && (state->state & bits) == bits) {
- total_bytes += min(search_end, state->end) + 1 -
- max(cur_start, state->start);
- if (total_bytes >= max_bytes)
- break;
- if (!found) {
- *start = max(cur_start, state->start);
- found = 1;
- }
- last = state->end;
- } else if (contig && found) {
- break;
- }
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return total_bytes;
+ return (exist == NULL) ? 0 : -EEXIST;
}
-/*
- * set the private field for a given byte offset in the tree. If there isn't
- * an extent_state there already, this does nothing.
- */
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
+static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
{
struct rb_node *node;
- struct extent_state *state;
- int ret = 0;
+ struct io_failure_record *failrec = ERR_PTR(-ENOENT);
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- ret = -ENOENT;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- ret = -ENOENT;
- goto out;
- }
- state->failrec = failrec;
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
-
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
-{
- struct rb_node *node;
- struct extent_state *state;
- struct io_failure_record *failrec;
-
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- failrec = ERR_PTR(-ENOENT);
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- failrec = ERR_PTR(-ENOENT);
- goto out;
- }
-
- failrec = state->failrec;
-out:
- spin_unlock(&tree->lock);
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search(&inode->io_failure_tree, start);
+ if (node)
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ spin_unlock(&inode->io_failure_lock);
return failrec;
}
-/*
- * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if every extent in the tree
- * has the bits set. Otherwise, 1 is returned if any bit in the
- * range is found set.
- */
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached)
-{
- struct extent_state *state = NULL;
- struct rb_node *node;
- int bitset = 0;
-
- spin_lock(&tree->lock);
- if (cached && extent_state_in_tree(cached) && cached->start <= start &&
- cached->end > start)
- node = &cached->rb_node;
- else
- node = tree_search(tree, start);
- while (node && start <= end) {
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (filled && state->start > start) {
- bitset = 0;
- break;
- }
-
- if (state->start > end)
- break;
-
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
- break;
- }
-
- if (state->end == (u64)-1)
- break;
-
- start = state->end + 1;
- if (start > end)
- break;
- node = rb_next(node);
- if (!node) {
- if (filled)
- bitset = 0;
- break;
- }
- }
- spin_unlock(&tree->lock);
- return bitset;
-}
-
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec)
+static void free_io_failure(struct btrfs_inode *inode,
+ struct io_failure_record *rec)
{
- int ret;
- int err = 0;
-
- set_state_failrec(failure_tree, rec->start, NULL);
- ret = clear_extent_bits(failure_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret)
- err = ret;
-
- ret = clear_extent_bits(io_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_DAMAGED);
- if (ret && !err)
- err = ret;
+ spin_lock(&inode->io_failure_lock);
+ rb_erase(&rec->rb_node, &inode->io_failure_tree);
+ spin_unlock(&inode->io_failure_lock);
kfree(rec);
- return err;
}
/*
@@ -2456,24 +659,18 @@ static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset)
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset)
{
- u64 private;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ u64 ino = btrfs_ino(inode);
+ u64 locked_start, locked_end;
struct io_failure_record *failrec;
- struct extent_state *state;
int mirror;
int ret;
- private = 0;
- ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
- EXTENT_DIRTY, 0);
- if (!ret)
- return 0;
-
- failrec = get_state_failrec(failure_tree, start);
+ failrec = get_failrec(inode, start);
if (IS_ERR(failrec))
return 0;
@@ -2482,14 +679,10 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
if (sb_rdonly(fs_info->sb))
goto out;
- spin_lock(&io_tree->lock);
- state = find_first_extent_bit_state(io_tree,
- failrec->start,
- EXTENT_LOCKED);
- spin_unlock(&io_tree->lock);
-
- if (!state || state->start > failrec->start ||
- state->end < failrec->start + failrec->len - 1)
+ ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
+ &locked_end, EXTENT_LOCKED, NULL);
+ if (ret || locked_start > failrec->bytenr ||
+ locked_end < failrec->bytenr + failrec->len - 1)
goto out;
mirror = failrec->this_mirror;
@@ -2500,7 +693,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
} while (mirror != failrec->failed_mirror);
out:
- free_io_failure(failure_tree, io_tree, failrec);
+ free_io_failure(inode, failrec);
return 0;
}
@@ -2512,30 +705,26 @@ out:
*/
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct extent_io_tree *failure_tree = &inode->io_failure_tree;
struct io_failure_record *failrec;
- struct extent_state *state, *next;
+ struct rb_node *node, *next;
- if (RB_EMPTY_ROOT(&failure_tree->state))
+ if (RB_EMPTY_ROOT(&inode->io_failure_tree))
return;
- spin_lock(&failure_tree->lock);
- state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
- while (state) {
- if (state->start > end)
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search_first(&inode->io_failure_tree, start);
+ while (node) {
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ if (failrec->bytenr > end)
break;
- ASSERT(state->end <= end);
-
- next = next_state(state);
-
- failrec = state->failrec;
- free_extent_state(state);
+ next = rb_next(node);
+ rb_erase(&failrec->rb_node, &inode->io_failure_tree);
kfree(failrec);
- state = next;
+ node = next;
}
- spin_unlock(&failure_tree->lock);
+ spin_unlock(&inode->io_failure_lock);
}
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
@@ -2545,16 +734,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start = bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u32 sectorsize = fs_info->sectorsize;
int ret;
- failrec = get_state_failrec(failure_tree, start);
+ failrec = get_failrec(BTRFS_I(inode), start);
if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
- failrec->logical, failrec->start, failrec->len);
+ failrec->logical, failrec->bytenr, failrec->len);
/*
* when data can be on disk more than twice, add to failrec here
* (e.g. with a list for failed_mirror) to make
@@ -2569,7 +756,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
if (!failrec)
return ERR_PTR(-ENOMEM);
- failrec->start = start;
+ RB_CLEAR_NODE(&failrec->rb_node);
+ failrec->bytenr = start;
failrec->len = sectorsize;
failrec->failed_mirror = bbio->mirror_num;
failrec->this_mirror = bbio->mirror_num;
@@ -2594,14 +782,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
}
/* Set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0) {
- ret = set_state_failrec(failure_tree, start, failrec);
- /* Set the bits in the inode's tree */
- ret = set_extent_bits(tree, start, start + sectorsize - 1,
- EXTENT_DAMAGED);
- } else if (ret < 0) {
+ ret = insert_failrec(BTRFS_I(inode), failrec);
+ if (ret) {
kfree(failrec);
return ERR_PTR(ret);
}
@@ -2616,8 +798,6 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
u64 start = failed_bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct bio *failed_bio = &failed_bbio->bio;
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
@@ -2646,17 +826,15 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
btrfs_debug(fs_info,
"failed to repair num_copies %d this_mirror %d failed_mirror %d",
failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
- free_io_failure(failure_tree, tree, failrec);
+ free_io_failure(BTRFS_I(inode), failrec);
return -EIO;
}
- repair_bio = btrfs_bio_alloc(1);
+ repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
+ failed_bbio->private);
repair_bbio = btrfs_bio(repair_bio);
repair_bbio->file_offset = start;
- repair_bio->bi_opf = REQ_OP_READ;
- repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
- repair_bio->bi_private = failed_bio->bi_private;
if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
@@ -2720,8 +898,8 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate)
if (uptodate)
set_extent_uptodate(&inode->io_tree, offset,
offset + sectorsize - 1, &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(&inode->io_tree, offset,
- offset + sectorsize - 1, &cached);
+ unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1,
+ &cached);
}
static void submit_data_read_repair(struct inode *inode,
@@ -2823,8 +1001,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio)
+static void end_bio_extent_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
@@ -2924,11 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed,
* Now we don't have range contiguous to the processed range, release
* the processed range now.
*/
- if (processed->uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, processed->start, processed->end,
- &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, processed->start, processed->end,
- &cached);
+ unlock_extent_atomic(tree, processed->start, processed->end, &cached);
update:
/* Update processed to current range */
@@ -2988,11 +1163,10 @@ static struct extent_buffer *find_extent_buffer_readpage(
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio)
+static void end_bio_extent_readpage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
- struct btrfs_bio *bbio = btrfs_bio(bio);
- struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
* The offset to the beginning of a bio, since one bio can never be
@@ -3019,8 +1193,6 @@ static void end_bio_extent_readpage(struct bio *bio)
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
- tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
/*
* We always issue full-sector reads, but if some block in a
@@ -3061,9 +1233,7 @@ static void end_bio_extent_readpage(struct bio *bio)
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, tree, start, page,
- btrfs_ino(BTRFS_I(inode)), 0);
+ btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
/*
* Zero out the remaining part if this range straddles
@@ -3162,50 +1332,6 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}
-/*
- * Initialize the members up to but not including 'bio'. Use after allocating a
- * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
- * 'bio' because use of __GFP_ZERO is not supported.
- */
-static inline void btrfs_bio_init(struct btrfs_bio *bbio)
-{
- memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-}
-
-/*
- * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
- *
- * The bio allocation is backed by bioset and does not fail.
- */
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
- bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio));
- return bio;
-}
-
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
-{
- struct bio *bio;
- struct btrfs_bio *bbio;
-
- ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
-
- /* this will never fail when it's backed by a bioset */
- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
- ASSERT(bio);
-
- bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio);
-
- bio_trim(bio, offset >> 9, size >> 9);
- bbio->iter = bio->bi_iter;
- return bio;
-}
-
/**
* Attempt to add a page to bio
*
@@ -3351,7 +1477,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct btrfs_bio_ctrl *bio_ctrl,
struct writeback_control *wbc,
blk_opf_t opf,
- bio_end_io_t end_io_func,
u64 disk_bytenr, u32 offset, u64 file_offset,
enum btrfs_compression_type compress_type)
{
@@ -3359,7 +1484,9 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct bio *bio;
int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ ASSERT(bio_ctrl->end_io_func);
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
@@ -3370,8 +1497,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
bio_ctrl->compress_type = compress_type;
- bio->bi_end_io = end_io_func;
- bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
@@ -3410,31 +1535,30 @@ static int alloc_new_bio(struct btrfs_inode *inode,
return 0;
error:
bio_ctrl->bio = NULL;
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
return ret;
}
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
- * @page: page to add to the bio
* @disk_bytenr: logical bytenr where the write will be
+ * @page: page to add to the bio
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @bio_ret: must be valid pointer, newly allocated bio will be stored there
- * @end_io_func: end_io callback for new bio
- * @mirror_num: desired mirror to read/write
- * @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @compress_type: compress type for current bio
+ *
+ * The will either add the page into the existing @bio_ctrl->bio, or allocate a
+ * new one in @bio_ctrl->bio.
+ * The mirror number for this IO should already be initizlied in
+ * @bio_ctrl->mirror_num.
*/
static int submit_extent_page(blk_opf_t opf,
struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page, u64 disk_bytenr,
+ u64 disk_bytenr, struct page *page,
size_t size, unsigned long pg_offset,
- bio_end_io_t end_io_func,
enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
@@ -3446,6 +1570,9 @@ static int submit_extent_page(blk_opf_t opf,
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
+
+ ASSERT(bio_ctrl->end_io_func);
+
if (force_bio_submit)
submit_one_bio(bio_ctrl);
@@ -3456,7 +1583,7 @@ static int submit_extent_page(blk_opf_t opf,
/* Allocate new bio if needed */
if (!bio_ctrl->bio) {
ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
- end_io_func, disk_bytenr, offset,
+ disk_bytenr, offset,
page_offset(page) + cur,
compress_type);
if (ret < 0)
@@ -3613,7 +1740,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 extent_offset;
u64 last_byte = i_size_read(inode);
u64 block_start;
- u64 cur_end;
struct extent_map *em;
int ret = 0;
size_t pg_offset = 0;
@@ -3623,7 +1749,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ret = set_page_extent_mapped(page);
if (ret < 0) {
- unlock_extent(tree, start, end);
+ unlock_extent(tree, start, end, NULL);
btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
unlock_page(page);
goto out;
@@ -3637,6 +1763,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
memzero_page(page, zero_offset, iosize);
}
}
+ bio_ctrl->end_io_func = end_bio_extent_readpage;
begin_page_read(fs_info, page);
while (cur <= end) {
unsigned long this_bio_flag = 0;
@@ -3651,15 +1778,14 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
memzero_page(page, pg_offset, iosize);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
if (IS_ERR(em)) {
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
ret = PTR_ERR(em);
break;
@@ -3672,7 +1798,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
this_bio_flag = em->compress_type;
iosize = min(extent_map_end(em) - cur, end - cur + 1);
- cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
if (this_bio_flag != BTRFS_COMPRESS_NONE)
disk_bytenr = em->block_start;
@@ -3735,43 +1860,31 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
/* the get_extent function already copied into the page */
- if (test_range_bit(tree, cur, cur_end,
- EXTENT_UPTODATE, 1, NULL)) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
- /* we have an inline extent but it didn't get marked up
- * to date. Error out
- */
if (block_start == EXTENT_MAP_INLINE) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, false, cur, iosize);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- bio_ctrl, page, disk_bytenr, iosize,
- pg_offset, end_bio_extent_readpage,
- this_bio_flag, force_bio_submit);
+ bio_ctrl, disk_bytenr, page, iosize,
+ pg_offset, this_bio_flag,
+ force_bio_submit);
if (ret) {
/*
* We have to unlock the remaining range, or the page
* will never be unlocked.
*/
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
goto out;
}
@@ -3984,6 +2097,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
wbc->nr_to_write--;
+ epd->bio_ctrl.end_io_func = end_bio_extent_writepage;
while (cur <= end) {
u64 disk_bytenr;
u64 em_end;
@@ -4077,10 +2191,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
ret = submit_extent_page(op | write_flags, wbc,
- &epd->bio_ctrl, page,
- disk_bytenr, iosize,
+ &epd->bio_ctrl, disk_bytenr,
+ page, iosize,
cur - page_offset(page),
- end_bio_extent_writepage,
0, false);
if (ret) {
has_error = true;
@@ -4431,8 +2544,9 @@ static struct extent_buffer *find_extent_buffer_nolock(
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
* after all extent buffers in the page has finished their writeback.
*/
-static void end_bio_subpage_eb_writepage(struct bio *bio)
+static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
@@ -4488,8 +2602,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio)
bio_put(bio);
}
-static void end_bio_extent_buffer_writepage(struct bio *bio)
+static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
@@ -4571,10 +2686,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
+ epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage;
+
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, page, eb->start, eb->len,
- eb->start - page_offset(page),
- end_bio_subpage_eb_writepage, 0, false);
+ &epd->bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
@@ -4605,6 +2721,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
prepare_eb_write(eb);
+ epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage;
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@@ -4612,10 +2730,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, p, disk_bytenr,
- PAGE_SIZE, 0,
- end_bio_extent_buffer_writepage,
- 0, false);
+ &epd->bio_ctrl, disk_bytenr, p,
+ PAGE_SIZE, 0, 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
@@ -5236,7 +3352,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, &cached_state);
+ lock_extent(tree, start, end, &cached_state);
folio_wait_writeback(folio);
/*
@@ -5244,7 +3360,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* so here we only need to unlock the extent range to free any
* existing extent state.
*/
- unlock_extent_cached(tree, start, end, &cached_state);
+ unlock_extent(tree, start, end, &cached_state);
return 0;
}
@@ -5263,15 +3379,17 @@ static int try_release_extent_state(struct extent_io_tree *tree,
if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
ret = 0;
} else {
+ u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+
/*
* At this point we can safely clear everything except the
* locked bit, the nodatasum bit and the delalloc new bit.
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
- 0, 0, NULL, mask, NULL);
+ ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
+ mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -5370,42 +3488,6 @@ next:
}
/*
- * helper function for fiemap, which doesn't want to see any holes.
- * This maps until we find something past 'last'
- */
-static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
- u64 offset, u64 last)
-{
- u64 sectorsize = btrfs_inode_sectorsize(inode);
- struct extent_map *em;
- u64 len;
-
- if (offset >= last)
- return NULL;
-
- while (1) {
- len = last - offset;
- if (len == 0)
- break;
- len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(inode, offset, len);
- if (IS_ERR(em))
- return em;
-
- /* if this isn't a hole return it */
- if (em->block_start != EXTENT_MAP_HOLE)
- return em;
-
- /* this is a hole, advance to the next extent */
- offset = extent_map_end(em);
- free_extent_map(em);
- if (offset >= last)
- break;
- }
- return NULL;
-}
-
-/*
* To cache previous fiemap extent
*
* Will be used for merging fiemap extent
@@ -5434,6 +3516,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
{
int ret = 0;
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
if (!cache->cached)
goto assign;
@@ -5457,16 +3542,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* So truly compressed (physical size smaller than logical size)
* extents won't get merged with each other
*
- * 3) Share same flags except FIEMAP_EXTENT_LAST
- * So regular extent won't get merged with prealloc extent
+ * 3) Share same flags
*/
if (cache->offset + cache->len == offset &&
cache->phys + cache->len == phys &&
- (cache->flags & ~FIEMAP_EXTENT_LAST) ==
- (flags & ~FIEMAP_EXTENT_LAST)) {
+ cache->flags == flags) {
cache->len += len;
- cache->flags |= flags;
- goto try_submit_last;
+ return 0;
}
/* Not mergeable, need to submit cached one */
@@ -5481,13 +3563,8 @@ assign:
cache->phys = phys;
cache->len = len;
cache->flags = flags;
-try_submit_last:
- if (cache->flags & FIEMAP_EXTENT_LAST) {
- ret = fiemap_fill_next_extent(fieinfo, cache->offset,
- cache->phys, cache->len, cache->flags);
- cache->cached = false;
- }
- return ret;
+
+ return 0;
}
/*
@@ -5517,215 +3594,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
- int ret = 0;
- u64 off;
- u64 max = start + len;
- u32 flags = 0;
- u32 found_type;
- u64 last;
- u64 last_for_get_extent = 0;
- u64 disko = 0;
- u64 isize = i_size_read(&inode->vfs_inode);
- struct btrfs_key found_key;
- struct extent_map *em = NULL;
- struct extent_state *cached_state = NULL;
- struct btrfs_path *path;
- struct btrfs_root *root = inode->root;
- struct fiemap_cache cache = { 0 };
- struct ulist *roots;
- struct ulist *tmp_ulist;
- int end = 0;
- u64 em_start = 0;
- u64 em_len = 0;
- u64 em_end = 0;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
- if (len == 0)
- return -EINVAL;
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
- path = btrfs_alloc_path();
- if (!path)
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
return -ENOMEM;
- roots = ulist_alloc(GFP_KERNEL);
- tmp_ulist = ulist_alloc(GFP_KERNEL);
- if (!roots || !tmp_ulist) {
- ret = -ENOMEM;
- goto out_free_ulist;
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
}
/*
- * We can't initialize that to 'start' as this could miss extents due
- * to extent item merging
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we call fiemap_fill_next_extent(), because that may cause a page
+ * fault when filling the user space buffer with fiemap data.
*/
- off = 0;
- start = round_down(start, btrfs_inode_sectorsize(inode));
- len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct btrfs_backref_shared_cache *backref_cache,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp_ulist,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
/*
- * lookup the last file extent. We're not using i_size here
- * because there might be preallocation past i_size
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
*/
- ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
- 0);
- if (ret < 0) {
- goto out_free_ulist;
- } else {
- WARN_ON(!ret);
- if (ret == 1)
- ret = 0;
- }
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = found_key.type;
-
- /* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(inode) ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
- /* have to trust i_size as the end */
- last = (u64)-1;
- last_for_get_extent = isize;
- } else {
/*
- * remember the start of the last extent. There are a
- * bunch of different factors that go into the length of the
- * extent, so its much less complex to remember where it started
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
*/
- last = found_key.offset;
- last_for_get_extent = last + 1;
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
}
- btrfs_release_path(path);
/*
- * we might have some extents allocated but more delalloc past those
- * extents. so, we trust isize unless the start of the last extent is
- * beyond isize
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
*/
- if (last < isize) {
- last = (u64)-1;
- last_for_get_extent = isize;
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
}
- lock_extent_bits(&inode->io_tree, start, start + len - 1,
- &cached_state);
+ return 0;
+}
- em = get_extent_skip_holes(inode, start, last_for_get_extent);
- if (!em)
- goto out;
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * at least it has the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+
+ /*
+ * For an inline extent, the disk_bytenr is where inline data starts at,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
+
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_shared_cache *backref_cache;
+ struct ulist *roots;
+ struct ulist *tmp_ulist;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ bool stopped = false;
+ int ret;
+
+ backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
+ path = btrfs_alloc_path();
+ roots = ulist_alloc(GFP_KERNEL);
+ tmp_ulist = ulist_alloc(GFP_KERNEL);
+ if (!backref_cache || !path || !roots || !tmp_ulist) {
+ ret = -ENOMEM;
goto out;
}
- while (!end) {
- u64 offset_in_extent = 0;
+ lockstart = round_down(start, root->fs_info->sectorsize);
+ lockend = round_up(start + len, root->fs_info->sectorsize);
+ prev_extent_end = lockstart;
- /* break if the extent we found is outside the range */
- if (em->start >= max || extent_map_end(em) < off)
- break;
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- /*
- * get_extent may return an extent that starts before our
- * requested range. We have to make sure the ranges
- * we return to fiemap always move forward and don't
- * overlap, so adjust the offsets here
- */
- em_start = max(em->start, off);
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, lockstart);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
/*
- * record the offset from the start of the extent
- * for adjusting the disk offset below. Only do this if the
- * extent isn't compressed since our in ram offset may be past
- * what we have actually allocated on disk.
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
*/
- if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- offset_in_extent = em_start - em->start;
- em_end = extent_map_end(em);
- em_len = em_end - em_start;
- flags = 0;
- if (em->block_start < EXTENT_MAP_LAST_BYTE)
- disko = em->block_start + offset_in_extent;
- else
- disko = 0;
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < lockend) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
/*
- * bump off for our next call to get_extent
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
*/
- off = extent_map_end(em);
- if (off >= max)
- end = 1;
-
- if (em->block_start == EXTENT_MAP_LAST_BYTE) {
- end = 1;
- flags |= FIEMAP_EXTENT_LAST;
- } else if (em->block_start == EXTENT_MAP_INLINE) {
- flags |= (FIEMAP_EXTENT_DATA_INLINE |
- FIEMAP_EXTENT_NOT_ALIGNED);
- } else if (em->block_start == EXTENT_MAP_DELALLOC) {
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- } else if (fieinfo->fi_extents_max) {
- u64 bytenr = em->block_start -
- (em->start - em->orig_start);
+ if (extent_end <= lockstart)
+ goto next_item;
- /*
- * As btrfs supports shared space, this information
- * can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
- * then we're just getting a count and we can skip the
- * lookup stuff.
- */
- ret = btrfs_check_shared(root, btrfs_ino(inode),
- bytenr, roots, tmp_ulist);
- if (ret < 0)
- goto out_free;
- if (ret)
- flags |= FIEMAP_EXTENT_SHARED;
- ret = 0;
+ /* We have in implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 range_end = min(key.offset, lockend) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ prev_extent_end, range_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= lockend) {
+ stopped = true;
+ break;
+ }
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
flags |= FIEMAP_EXTENT_ENCODED;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
- free_extent_map(em);
- em = NULL;
- if ((em_start >= last) || em_len == (u64)-1 ||
- (last == (u64)-1 && isize <= em_end)) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache,
+ disk_bytenr, extent_offset,
+ extent_gen, roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(root, ino,
+ disk_bytenr,
+ extent_gen,
+ roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
}
- /* now scan forward to see if this is really the last extent. */
- em = get_extent_skip_holes(inode, off, last_for_get_extent);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
}
- if (!em) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
}
- ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
- em_len, flags);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out_free;
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
}
+ cond_resched();
}
-out_free:
- if (!ret)
- ret = emit_last_fiemap_cache(fieinfo, &cache);
- free_extent_map(em);
-out:
- unlock_extent_cached(&inode->io_tree, start, start + len - 1,
- &cached_state);
-out_free_ulist:
+check_eof_delalloc:
+ /*
+ * Release (and free) the path before emitting any final entries to
+ * fiemap_fill_next_extent() to keep lockdep happy. This is because
+ * once we find no more file extent items exist, we may have a
+ * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
+ * faults when copying data to the user space buffer.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ if (!stopped && prev_extent_end < lockend) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache,
+ 0, 0, 0, roots, tmp_ulist,
+ prev_extent_end, lockend - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = lockend;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+
+out_unlock:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+out:
+ kfree(backref_cache);
btrfs_free_path(path);
ulist_free(roots);
ulist_free(tmp_ulist);
@@ -5856,7 +4252,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
- btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+ btrfs_leak_debug_del_eb(eb);
__free_extent_buffer(eb);
}
@@ -5873,8 +4269,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->bflags = 0;
init_rwsem(&eb->lock);
- btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
- &fs_info->allocated_ebs);
+ btrfs_leak_debug_add_eb(eb);
INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
@@ -6342,7 +4737,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
- btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+ btrfs_leak_debug_del_eb(eb);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -6362,18 +4757,16 @@ static int release_extent_buffer(struct extent_buffer *eb)
void free_extent_buffer(struct extent_buffer *eb)
{
int refs;
- int old;
if (!eb)
return;
+ refs = atomic_read(&eb->refs);
while (1) {
- refs = atomic_read(&eb->refs);
if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
|| (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
refs == 1))
break;
- old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
- if (old == refs)
+ if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
return;
}
@@ -6569,7 +4962,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
return -EAGAIN;
} else {
- ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
if (ret < 0)
return ret;
}
@@ -6579,7 +4972,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
PageUptodate(page) ||
btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
return ret;
}
@@ -6587,13 +4980,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
eb->read_mirror = 0;
atomic_set(&eb->io_pages, 1);
check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
+
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
- page, eb->start, eb->len,
- eb->start - page_offset(page),
- end_bio_extent_readpage, 0, true);
+ eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, true);
if (ret) {
/*
* In the endio function, if we hit something wrong we will
@@ -6684,6 +5078,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
* set io_pages. See check_buffer_tree_ref for a more detailed comment.
*/
check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
@@ -6696,9 +5091,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
ClearPageError(page);
err = submit_extent_page(REQ_OP_READ, NULL,
- &bio_ctrl, page, page_offset(page),
- PAGE_SIZE, 0, end_bio_extent_readpage,
- 0, false);
+ &bio_ctrl, page_offset(page), page,
+ PAGE_SIZE, 0, 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4bc72a87b9a9..7929f054dda3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -60,11 +60,13 @@ enum {
struct btrfs_bio;
struct btrfs_root;
struct btrfs_inode;
-struct btrfs_io_bio;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
+int __init extent_buffer_init_cachep(void);
+void __cold extent_buffer_free_cachep(void);
+
typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
enum btrfs_compression_type compress_type);
@@ -240,10 +242,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
@@ -257,8 +259,12 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
struct page *page;
- u64 start;
u64 len;
u64 logical;
int this_mirror;
@@ -269,6 +275,9 @@ struct io_failure_record {
int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
u32 bio_offset, struct page *page, unsigned int pgoff,
submit_bio_hook_t *submit_bio_hook);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6fee14ce2e6b..6092a4eedc92 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -7,6 +7,7 @@
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
+#include "btrfs_inode.h"
static struct kmem_cache *extent_map_cache;
@@ -54,9 +55,7 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
- em->generation = 0;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -73,7 +72,6 @@ void free_extent_map(struct extent_map *em)
{
if (!em)
return;
- WARN_ON(refcount_read(&em->refs) == 0);
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
@@ -143,8 +141,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
* it can't be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
+ struct rb_node **prev_or_next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
@@ -152,6 +149,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct extent_map *entry;
struct extent_map *prev_entry = NULL;
+ ASSERT(prev_or_next_ret);
+
while (n) {
entry = rb_entry(n, struct extent_map, rb_node);
prev = n;
@@ -165,24 +164,29 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return n;
}
- if (prev_ret) {
- orig_prev = prev;
- while (prev && offset >= extent_map_end(prev_entry)) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *prev_ret = prev;
- prev = orig_prev;
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+
+ /*
+ * Previous extent map found, return as in this case the caller does not
+ * care about the next one.
+ */
+ if (prev) {
+ *prev_or_next_ret = prev;
+ return NULL;
}
- if (next_ret) {
+ prev = orig_prev;
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *next_ret = prev;
}
+ *prev_or_next_ret = prev;
+
return NULL;
}
@@ -336,6 +340,8 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
if (extent_map_in_tree(em))
try_merge_map(tree, em);
@@ -382,7 +388,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
__clear_extent_bit(&device->alloc_state, stripe->physical,
stripe->physical + stripe_size - 1, bits,
- 0, 0, NULL, GFP_NOWAIT, NULL);
+ NULL, GFP_NOWAIT, NULL);
}
}
@@ -425,16 +431,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
{
struct extent_map *em;
struct rb_node *rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *next = NULL;
+ struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
if (!rb_node) {
- if (prev)
- rb_node = prev;
- else if (next)
- rb_node = next;
+ if (prev_or_next)
+ rb_node = prev_or_next;
else
return NULL;
}
@@ -658,3 +661,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
ASSERT(ret == 0 || ret == -EEXIST);
return ret;
}
+
+/*
+ * Drop all extent maps from a tree in the fastest possible way, rescheduling
+ * if needed. This avoids searching the tree, from the root down to the first
+ * extent map, before each deletion.
+ */
+static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+{
+ write_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ struct extent_map *em;
+ struct rb_node *node;
+
+ node = rb_first_cached(&tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(tree, em);
+ free_extent_map(em);
+ cond_resched_rwlock_write(&tree->lock);
+ }
+ write_unlock(&tree->lock);
+}
+
+/*
+ * Drop all extent maps in a given range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the range.
+ * @end: End offset of the range (inclusive value).
+ * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
+ *
+ * This drops all the extent maps that intersect the given range [@start, @end].
+ * Extent maps that partially overlap the range and extend behind or beyond it,
+ * are split.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
+ bool skip_pinned)
+{
+ struct extent_map *split;
+ struct extent_map *split2;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 len = end - start + 1;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ if (start == 0 && !skip_pinned) {
+ drop_all_extent_maps_fast(em_tree);
+ return;
+ }
+ len = (u64)-1;
+ } else {
+ /* Make end offset exclusive for use in the loop below. */
+ end++;
+ }
+
+ /*
+ * It's ok if we fail to allocate the extent maps, see the comment near
+ * the bottom of the loop below. We only need two spare extent maps in
+ * the worst case, where the first extent map that intersects our range
+ * starts before the range and the last extent map that intersects our
+ * range ends after our range (and they might be the same extent map),
+ * because we need to split those two extent maps at the boundaries.
+ */
+ split = alloc_extent_map();
+ split2 = alloc_extent_map();
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+
+ while (em) {
+ /* extent_map_end() returns exclusive value (last byte + 1). */
+ const u64 em_end = extent_map_end(em);
+ struct extent_map *next_em = NULL;
+ u64 gen;
+ unsigned long flags;
+ bool modified;
+ bool compressed;
+
+ if (em_end < end) {
+ next_em = next_extent_map(em);
+ if (next_em) {
+ if (next_em->start < end)
+ refcount_inc(&next_em->refs);
+ else
+ next_em = NULL;
+ }
+ }
+
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ start = em_end;
+ if (end != (u64)-1)
+ len = start + len - em_end;
+ goto next;
+ }
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+
+ /*
+ * The extent map does not cross our target range, so no need to
+ * split it, we can remove it directly.
+ */
+ if (em->start >= start && em_end <= end)
+ goto remove_em;
+
+ flags = em->flags;
+ gen = em->generation;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ if (em->start < start) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = em->start;
+ split->len = start - em->start;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+
+ split->generation = gen;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ replace_extent_mapping(em_tree, em, split, modified);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (em_end > end) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = start + len;
+ split->len = em_end - (start + len);
+ split->block_start = em->block_start;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ split->generation = gen;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->orig_start = em->orig_start;
+ } else {
+ const u64 diff = start + len - em->start;
+
+ split->block_len = split->len;
+ split->block_start += diff;
+ split->orig_start = em->orig_start;
+ }
+ } else {
+ split->ram_bytes = split->len;
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->orig_block_len = 0;
+ }
+
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ int ret;
+
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ /* Logic error, shouldn't happen. */
+ ASSERT(ret == 0);
+ if (WARN_ON(ret != 0) && modified)
+ btrfs_set_inode_full_sync(inode);
+ }
+ free_extent_map(split);
+ split = NULL;
+ }
+remove_em:
+ if (extent_map_in_tree(em)) {
+ /*
+ * If the extent map is still in the tree it means that
+ * either of the following is true:
+ *
+ * 1) It fits entirely in our range (doesn't end beyond
+ * it or starts before it);
+ *
+ * 2) It starts before our range and/or ends after our
+ * range, and we were not able to allocate the extent
+ * maps for split operations, @split and @split2.
+ *
+ * If we are at case 2) then we just remove the entire
+ * extent map - this is fine since if anyone needs it to
+ * access the subranges outside our range, will just
+ * load it again from the subvolume tree's file extent
+ * item. However if the extent map was in the list of
+ * modified extents, then we must mark the inode for a
+ * full fsync, otherwise a fast fsync will miss this
+ * extent if it's new and needs to be logged.
+ */
+ if ((em->start < start || em_end > end) && modified) {
+ ASSERT(!split);
+ btrfs_set_inode_full_sync(inode);
+ }
+ remove_extent_mapping(em_tree, em);
+ }
+
+ /*
+ * Once for the tree reference (we replaced or removed the
+ * extent map from the tree).
+ */
+ free_extent_map(em);
+next:
+ /* Once for us (for our lookup reference). */
+ free_extent_map(em);
+
+ em = next_em;
+ }
+
+ write_unlock(&em_tree->lock);
+
+ free_extent_map(split);
+ free_extent_map(split2);
+}
+
+/*
+ * Replace a range in the inode's extent map tree with a new extent map.
+ *
+ * @inode: The target inode.
+ * @new_em: The new extent map to add to the inode's extent map tree.
+ * @modified: Indicate if the new extent map should be added to the list of
+ * modified extents (for fast fsync tracking).
+ *
+ * Drops all the extent maps in the inode's extent map tree that intersect the
+ * range of the new extent map and adds the new extent map to the tree.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified)
+{
+ const u64 end = new_em->start + new_em->len - 1;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ int ret;
+
+ ASSERT(!extent_map_in_tree(new_em));
+
+ /*
+ * The caller has locked an appropriate file range in the inode's io
+ * tree, but getting -EEXIST when adding the new extent map can still
+ * happen in case there are extents that partially cover the range, and
+ * this is due to two tasks operating on different parts of the extent.
+ * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
+ * btrfs_get_extent") for an example and details.
+ */
+ do {
+ btrfs_drop_extent_map_range(inode, new_em->start, end, false);
+ write_lock(&tree->lock);
+ ret = add_extent_mapping(tree, new_em, modified);
+ write_unlock(&tree->lock);
+ } while (ret == -EEXIST);
+
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index d2fa32ffe304..ad311864272a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -63,6 +63,8 @@ struct extent_map_tree {
rwlock_t lock;
};
+struct btrfs_inode;
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
@@ -104,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+ u64 start, u64 end,
+ bool skip_pinned);
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c828f971a346..6bb9fa961a6a 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -118,7 +118,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return clear_extent_bit(&inode->file_extent_tree, start,
- start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
+ start + len - 1, EXTENT_DIRTY, NULL);
}
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
@@ -129,12 +129,20 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
return ncsums * fs_info->sectorsize;
}
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+/*
+ * Calculate the total size needed to allocate for an ordered sum structure
+ * spanning @bytes in the file.
+ */
+static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+{
+ int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
+
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
+}
+
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding)
+ u64 objectid, u64 pos, u64 num_bytes)
{
int ret = 0;
struct btrfs_file_extent_item *item;
@@ -157,16 +165,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
- btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, item, offset);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
+ btrfs_set_file_extent_offset(leaf, item, 0);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_compression(leaf, item, compression);
- btrfs_set_file_extent_encryption(leaf, item, encryption);
- btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+ btrfs_set_file_extent_compression(leaf, item, 0);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
btrfs_mark_buffer_dirty(leaf);
out:
@@ -503,7 +511,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit)
+ struct list_head *list, int search_commit,
+ bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -525,6 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (!path)
return -ENOMEM;
+ path->nowait = nowait;
if (search_commit) {
path->skip_locking = 1;
path->reada = READA_FORWARD;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5a3f6e0d9688..176b432035ae 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -473,7 +473,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
*/
clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached);
+ cached);
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
@@ -499,159 +499,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
}
/*
- * this drops all the extents in the cache that intersect the range
- * [start, end]. Existing extents are split as required.
- */
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned)
-{
- struct extent_map *em;
- struct extent_map *split = NULL;
- struct extent_map *split2 = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
- u64 len = end - start + 1;
- u64 gen;
- int ret;
- int testend = 1;
- unsigned long flags;
- int compressed = 0;
- bool modified;
-
- WARN_ON(end < start);
- if (end == (u64)-1) {
- len = (u64)-1;
- testend = 0;
- }
- while (1) {
- int no_splits = 0;
-
- modified = false;
- if (!split)
- split = alloc_extent_map();
- if (!split2)
- split2 = alloc_extent_map();
- if (!split || !split2)
- no_splits = 1;
-
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
- write_unlock(&em_tree->lock);
- break;
- }
- flags = em->flags;
- gen = em->generation;
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- if (testend && em->start + em->len >= start + len) {
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- break;
- }
- start = em->start + em->len;
- if (testend)
- len = start + len - (em->start + em->len);
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- continue;
- }
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
- if (no_splits)
- goto next;
-
- if (em->start < start) {
- split->start = em->start;
- split->len = start - em->start;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
-
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
- split->ram_bytes = em->ram_bytes;
- } else {
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- split->ram_bytes = split->len;
- }
-
- split->generation = gen;
- split->flags = flags;
- split->compress_type = em->compress_type;
- replace_extent_mapping(em_tree, em, split, modified);
- free_extent_map(split);
- split = split2;
- split2 = NULL;
- }
- if (testend && em->start + em->len > start + len) {
- u64 diff = start + len - em->start;
-
- split->start = start + len;
- split->len = em->start + em->len - (start + len);
- split->flags = flags;
- split->compress_type = em->compress_type;
- split->generation = gen;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_block_len = max(em->block_len,
- em->orig_block_len);
-
- split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->block_start = em->block_start;
- split->orig_start = em->orig_start;
- } else {
- split->block_len = split->len;
- split->block_start = em->block_start
- + diff;
- split->orig_start = em->orig_start;
- }
- } else {
- split->ram_bytes = split->len;
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- }
-
- if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
- } else {
- ret = add_extent_mapping(em_tree, split,
- modified);
- ASSERT(ret == 0); /* Logic error */
- }
- free_extent_map(split);
- split = NULL;
- }
-next:
- if (extent_map_in_tree(em))
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us */
- free_extent_map(em);
- /* once for the tree*/
- free_extent_map(em);
- }
- if (split)
- free_extent_map(split);
- if (split2)
- free_extent_map(split2);
-}
-
-/*
* this is very complex, but the basic idea is to drop all extents
* in the range start - end. hint_block is filled in with a block number
* that would be a good hint to the block allocator for this file.
@@ -708,7 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
}
if (args->drop_cache)
- btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
+ btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
@@ -1339,26 +1186,54 @@ static int prepare_uptodate_page(struct inode *inode,
return 0;
}
+static unsigned int get_prepare_fgp_flags(bool nowait)
+{
+ unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+ if (nowait)
+ fgp_flags |= FGP_NOWAIT;
+
+ return fgp_flags;
+}
+
+static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
+{
+ gfp_t gfp;
+
+ gfp = btrfs_alloc_write_mask(inode->i_mapping);
+ if (nowait) {
+ gfp &= ~__GFP_DIRECT_RECLAIM;
+ gfp |= GFP_NOWAIT;
+ }
+
+ return gfp;
+}
+
/*
* this just gets pages into the page cache and locks them down.
*/
static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
- size_t write_bytes, bool force_uptodate)
+ size_t write_bytes, bool force_uptodate,
+ bool nowait)
{
int i;
unsigned long index = pos >> PAGE_SHIFT;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ gfp_t mask = get_prepare_gfp_flags(inode, nowait);
+ unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
int err = 0;
int faili;
for (i = 0; i < num_pages; i++) {
again:
- pages[i] = find_or_create_page(inode->i_mapping, index + i,
- mask | __GFP_WRITE);
+ pages[i] = pagecache_get_page(inode->i_mapping, index + i,
+ fgp_flags, mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
- err = -ENOMEM;
+ if (nowait)
+ err = -EAGAIN;
+ else
+ err = -ENOMEM;
goto fail;
}
@@ -1376,7 +1251,7 @@ again:
pos + write_bytes, false);
if (err) {
put_page(pages[i]);
- if (err == -EAGAIN) {
+ if (!nowait && err == -EAGAIN) {
err = 0;
goto again;
}
@@ -1411,7 +1286,7 @@ static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
size_t write_bytes,
- u64 *lockstart, u64 *lockend,
+ u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1426,15 +1301,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(&inode->io_tree, start_pos, last_pos,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+
+ return -EAGAIN;
+ }
+ } else {
+ lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
- unlock_extent_cached(&inode->io_tree, start_pos,
- last_pos, cached_state);
+ unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1481,7 +1368,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
*/
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1500,17 +1387,22 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
fs_info->sectorsize) - 1;
num_bytes = lockend - lockstart + 1;
- btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+ if (nowait) {
+ if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ return -EAGAIN;
+ }
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+ }
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL, false);
- if (ret <= 0) {
- ret = 0;
+ NULL, NULL, NULL, nowait, false);
+ if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
- } else {
+ else
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
- }
- unlock_extent(&inode->io_tree, lockstart, lockend);
+ unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
return ret;
}
@@ -1607,8 +1499,10 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
+ unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
ret = btrfs_inode_lock(inode, ilock_flags);
@@ -1664,18 +1558,29 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
extent_changeset_release(data_reserved);
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&data_reserved, pos,
- write_bytes);
+ write_bytes, nowait);
if (ret < 0) {
+ int can_nocow;
+
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
+ ret = -EAGAIN;
+ break;
+ }
+
/*
* If we don't have to COW at the offset, reserve
* metadata only. write_bytes may get smaller than
* requested here.
*/
- if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes) > 0)
- only_release_metadata = true;
- else
+ can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
break;
+ only_release_metadata = true;
}
num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
@@ -1685,7 +1590,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes,
- reserve_bytes, false);
+ reserve_bytes, nowait);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -1698,14 +1603,17 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
release_bytes = reserve_bytes;
again:
+ ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
+ if (ret)
+ break;
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
* contents of pages from loop to loop
*/
ret = prepare_pages(inode, pages, num_pages,
- pos, write_bytes,
- force_page_uptodate);
+ pos, write_bytes, force_page_uptodate, false);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
@@ -1715,10 +1623,11 @@ again:
extents_locked = lock_and_cleanup_extent_if_need(
BTRFS_I(inode), pages,
num_pages, pos, write_bytes, &lockstart,
- &lockend, &cached_state);
+ &lockend, nowait, &cached_state);
if (extents_locked < 0) {
- if (extents_locked == -EAGAIN)
+ if (!nowait && extents_locked == -EAGAIN)
goto again;
+
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
ret = extents_locked;
@@ -1782,8 +1691,8 @@ again:
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
else
free_extent_state(cached_state);
@@ -1801,8 +1710,6 @@ again:
cond_resched();
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
pos += copied;
num_written += copied;
}
@@ -2045,7 +1952,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
if (sync)
@@ -2201,14 +2108,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * Always check for the full sync flag while holding the inode's lock,
- * to avoid races with other tasks. The flag must be either set all the
- * time during logging or always off all the time while logging.
- */
- full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
-
- /*
* Before we acquired the inode's lock and the mmap lock, someone may
* have dirtied more pages in the target range. We need to make sure
* that writeback for any such pages does not start while we are logging
@@ -2233,6 +2132,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
/*
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
+ * We check the flag here after starting delalloc above, because when
+ * running delalloc the full sync flag may be set if we need to drop
+ * extra extent map ranges due to temporary memory allocation failures.
+ */
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ /*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
@@ -2380,6 +2290,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
+ ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;
@@ -2448,7 +2359,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct extent_map *hole_em;
- struct extent_map_tree *em_tree = &inode->extent_tree;
struct btrfs_key key;
int ret;
@@ -2505,8 +2415,8 @@ static int fill_holes(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
- offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
+ end - offset);
if (ret)
return ret;
@@ -2515,7 +2425,7 @@ out:
hole_em = alloc_extent_map();
if (!hole_em) {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ btrfs_drop_extent_map_range(inode, offset, end - 1, false);
btrfs_set_inode_full_sync(inode);
} else {
hole_em->start = offset;
@@ -2529,12 +2439,7 @@ out:
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
- do {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- } while (ret == -EEXIST);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
if (ret)
btrfs_set_inode_full_sync(inode);
@@ -2591,8 +2496,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2607,8 +2512,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
page_lockend))
break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
}
btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
@@ -3008,9 +2913,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
if (ret)
goto out_only_mutex;
- lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
- lockend = round_down(offset + len,
- btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
+ lockstart = round_up(offset, fs_info->sectorsize);
+ lockend = round_down(offset + len, fs_info->sectorsize) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -3108,8 +3012,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
@@ -3212,7 +3116,7 @@ enum {
static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
struct extent_map *em;
int ret;
@@ -3242,7 +3146,7 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
+ const u64 sectorsize = fs_info->sectorsize;
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3382,16 +3286,16 @@ reserve_space:
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
offset + len, &alloc_hint);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
@@ -3428,7 +3332,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 data_space_reserved = 0;
u64 qgroup_reserved = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
+ int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
int ret;
/* Do not allow fallocate in ZONED mode */
@@ -3502,8 +3406,8 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
@@ -3592,31 +3496,290 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
out:
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
return ret;
}
+/*
+ * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
+ * that has unflushed and/or flushing delalloc. There might be other adjacent
+ * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
+ * looping while it gets adjacent subranges, and merging them together.
+ */
+static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ const u64 len = end + 1 - start;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ u64 em_end;
+ u64 delalloc_len;
+
+ /*
+ * Search the io tree first for EXTENT_DELALLOC. If we find any, it
+ * means we have delalloc (dirty pages) for which writeback has not
+ * started yet.
+ */
+ *delalloc_start_ret = start;
+ delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1);
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
+ if (delalloc_len > 0)
+ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+
+ /*
+ * Now also check if there's any extent map in the range that does not
+ * map to a hole or prealloc extent. We do this because:
+ *
+ * 1) When delalloc is flushed, the file range is locked, we clear the
+ * EXTENT_DELALLOC bit from the io tree and create an extent map for
+ * an allocated extent. So we might just have been called after
+ * delalloc is flushed and before the ordered extent completes and
+ * inserts the new file extent item in the subvolume's btree;
+ *
+ * 2) We may have an extent map created by flushing delalloc for a
+ * subrange that starts before the subrange we found marked with
+ * EXTENT_DELALLOC in the io tree.
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ read_unlock(&em_tree->lock);
+
+ /* extent_map_end() returns a non-inclusive end offset. */
+ em_end = em ? extent_map_end(em) : 0;
+
+ /*
+ * If we have a hole/prealloc extent map, check the next one if this one
+ * ends before our range's end.
+ */
+ if (em && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
+ struct extent_map *next_em;
+
+ read_lock(&em_tree->lock);
+ next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
+ read_unlock(&em_tree->lock);
+
+ free_extent_map(em);
+ em_end = next_em ? extent_map_end(next_em) : 0;
+ em = next_em;
+ }
+
+ if (em && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
+ /*
+ * No extent map or one for a hole or prealloc extent. Use the delalloc
+ * range we found in the io tree if we have one.
+ */
+ if (!em)
+ return (delalloc_len > 0);
+
+ /*
+ * We don't have any range as EXTENT_DELALLOC in the io tree, so the
+ * extent map is the only subrange representing delalloc.
+ */
+ if (delalloc_len == 0) {
+ *delalloc_start_ret = em->start;
+ *delalloc_end_ret = min(end, em_end - 1);
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map represents a delalloc range that starts before the
+ * delalloc range we found in the io tree.
+ */
+ if (em->start < *delalloc_start_ret) {
+ *delalloc_start_ret = em->start;
+ /*
+ * If the ranges are adjacent, return a combined range.
+ * Otherwise return the extent map's range.
+ */
+ if (em_end < *delalloc_start_ret)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map starts after the delalloc range we found in the io
+ * tree. If it's adjacent, return a combined range, otherwise return
+ * the range found in the io tree.
+ */
+ if (*delalloc_end_ret + 1 == em->start)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+}
+
+/*
+ * Check if there's delalloc in a given range.
+ *
+ * @inode: The inode.
+ * @start: The start offset of the range. It does not need to be
+ * sector size aligned.
+ * @end: The end offset (inclusive value) of the search range.
+ * It does not need to be sector size aligned.
+ * @delalloc_start_ret: Output argument, set to the start offset of the
+ * subrange found with delalloc (may not be sector size
+ * aligned).
+ * @delalloc_end_ret: Output argument, set to he end offset (inclusive value)
+ * of the subrange found with delalloc.
+ *
+ * Returns true if a subrange with delalloc is found within the given range, and
+ * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
+ * end offsets of the subrange.
+ */
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
+ u64 prev_delalloc_end = 0;
+ bool ret = false;
+
+ while (cur_offset < end) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ if (prev_delalloc_end == 0) {
+ /* First subrange found. */
+ *delalloc_start_ret = max(delalloc_start, start);
+ *delalloc_end_ret = delalloc_end;
+ ret = true;
+ } else if (delalloc_start == prev_delalloc_end + 1) {
+ /* Subrange adjacent to the previous one, merge them. */
+ *delalloc_end_ret = delalloc_end;
+ } else {
+ /* Subrange not adjacent to the previous one, exit. */
+ break;
+ }
+
+ prev_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * Check if there's a hole or delalloc range in a range representing a hole (or
+ * prealloc extent) found in the inode's subvolume btree.
+ *
+ * @inode: The inode.
+ * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
+ * @start: Start offset of the hole region. It does not need to be sector
+ * size aligned.
+ * @end: End offset (inclusive value) of the hole region. It does not
+ * need to be sector size aligned.
+ * @start_ret: Return parameter, used to set the start of the subrange in the
+ * hole that matches the search criteria (seek mode), if such
+ * subrange is found (return value of the function is true).
+ * The value returned here may not be sector size aligned.
+ *
+ * Returns true if a subrange matching the given seek mode is found, and if one
+ * is found, it updates @start_ret with the start of the subrange.
+ */
+static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ u64 start, u64 end, u64 *start_ret)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end,
+ &delalloc_start, &delalloc_end);
+ if (delalloc && whence == SEEK_DATA) {
+ *start_ret = delalloc_start;
+ return true;
+ }
+
+ if (delalloc && whence == SEEK_HOLE) {
+ /*
+ * We found delalloc but it starts after out start offset. So we
+ * have a hole between our start offset and the delalloc start.
+ */
+ if (start < delalloc_start) {
+ *start_ret = start;
+ return true;
+ }
+ /*
+ * Delalloc range starts at our start offset.
+ * If the delalloc range's length is smaller than our range,
+ * then it means we have a hole that starts where the delalloc
+ * subrange ends.
+ */
+ if (delalloc_end < end) {
+ *start_ret = delalloc_end + 1;
+ return true;
+ }
+
+ /* There's delalloc for the whole range. */
+ return false;
+ }
+
+ if (!delalloc && whence == SEEK_HOLE) {
+ *start_ret = start;
+ return true;
+ }
+
+ /*
+ * No delalloc in the range and we are seeking for data. The caller has
+ * to iterate to the next extent item in the subvolume btree.
+ */
+ return false;
+}
+
static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- loff_t i_size = inode->vfs_inode.i_size;
+ const loff_t i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 last_extent_end;
u64 lockstart;
u64 lockend;
u64 start;
- u64 len;
- int ret = 0;
+ int ret;
+ bool found = false;
if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
+ * Quick path. If the inode has no prealloc extents and its number of
+ * bytes used matches its i_size, then it can not have holes.
+ */
+ if (whence == SEEK_HOLE &&
+ !(inode->flags & BTRFS_INODE_PREALLOC) &&
+ inode_get_bytes(&inode->vfs_inode) == i_size)
+ return i_size;
+
+ /*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
@@ -3627,45 +3790,164 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
- len = lockend - lockstart + 1;
- lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ last_extent_end = lockstart;
+
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
while (start < i_size) {
- em = btrfs_get_extent_fiemap(inode, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- em = NULL;
- break;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *extent;
+ u64 extent_end;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
}
- if (whence == SEEK_HOLE &&
- (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
- break;
- else if (whence == SEEK_DATA &&
- (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
- start = em->start + em->len;
- free_extent_map(em);
- em = NULL;
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * In the first iteration we may have a slot that points to an
+ * extent that ends before our start offset, so skip it.
+ */
+ if (extent_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ /* We have an implicit hole, NO_HOLES feature is likely set. */
+ if (last_extent_end < key.offset) {
+ u64 search_start = last_extent_end;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ key.offset - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the extent.
+ */
+ }
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
+ btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * Explicit hole or prealloc extent, search for delalloc.
+ * A prealloc extent is treated like a hole.
+ */
+ u64 search_start = key.offset;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ extent_end - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the next
+ * extent item.
+ */
+ } else {
+ /*
+ * Found a regular or inline extent.
+ * If we are seeking for data, adjust the start offset
+ * and stop, we're done.
+ */
+ if (whence == SEEK_DATA) {
+ start = max_t(u64, key.offset, offset);
+ found = true;
+ break;
+ }
+ /*
+ * Else, we are seeking for a hole, check the next file
+ * extent item.
+ */
+ }
+
+ start = extent_end;
+ last_extent_end = extent_end;
+ path->slots[0]++;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
cond_resched();
}
- free_extent_map(em);
- unlock_extent_cached(&inode->io_tree, lockstart, lockend,
- &cached_state);
- if (ret) {
- offset = ret;
- } else {
- if (whence == SEEK_DATA && start >= i_size)
- offset = -ENXIO;
- else
- offset = min_t(loff_t, start, i_size);
+
+ /* We have an implicit hole from the last extent found up to i_size. */
+ if (!found && start < i_size) {
+ found = find_desired_extent_in_hole(inode, whence, start,
+ i_size - 1, &start);
+ if (!found)
+ start = i_size;
}
- return offset;
+out:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_free_path(path);
+
+ if (ret < 0)
+ return ret;
+
+ if (whence == SEEK_DATA && start >= i_size)
+ return -ENXIO;
+
+ return min_t(loff_t, start, i_size);
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -3693,7 +3975,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
ret = fsverity_file_open(inode, filp);
if (ret)
@@ -3810,6 +4092,7 @@ const struct file_operations btrfs_file_operations = {
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 996da650ecdc..f4023651dd68 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -48,6 +48,24 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, bool update_stats);
+static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ free_bitmap(ctl, info);
+ }
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+}
+
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
@@ -126,10 +144,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
block_group->disk_cache_state = BTRFS_DC_CLEAR;
}
- if (!block_group->iref) {
+ if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
block_group->inode = igrab(inode);
- block_group->iref = 1;
- }
spin_unlock(&block_group->lock);
return inode;
@@ -241,8 +257,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
block_group->inode = NULL;
spin_unlock(&block_group->lock);
iput(inode);
@@ -333,8 +348,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
btrfs_i_size_write(inode, 0);
truncate_pagecache(vfs_inode, 0);
- lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
- btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
/*
* We skip the throttling logic for free space cache inodes, so we don't
@@ -345,7 +360,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
+ unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
@@ -693,6 +708,12 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
max_bitmaps = max_t(u64, max_bitmaps, 1);
+ if (ctl->total_bitmaps > max_bitmaps)
+ btrfs_err(block_group->fs_info,
+"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
+ block_group->start, block_group->length,
+ ctl->total_bitmaps, ctl->unit, max_bitmaps,
+ bytes_per_bg);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
/*
@@ -875,7 +896,10 @@ out:
return ret;
free_cache:
io_ctl_drop_pages(&io_ctl);
+
+ spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache(ctl);
+ spin_unlock(&ctl->tree_lock);
goto out;
}
@@ -914,6 +938,8 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
return ret;
}
+static struct lock_class_key btrfs_free_space_inode_key;
+
int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -983,6 +1009,14 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
}
spin_unlock(&block_group->lock);
+ /*
+ * Reinitialize the class of struct inode's mapping->invalidate_lock for
+ * free space inodes to prevent false positives related to locks for normal
+ * inodes.
+ */
+ lockdep_set_class(&(&inode->i_data)->invalidate_lock,
+ &btrfs_free_space_inode_key);
+
ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
path, block_group->start);
btrfs_free_path(path);
@@ -1001,7 +1035,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
if (ret == 0)
ret = 1;
} else {
+ /*
+ * We need to call the _locked variant so we don't try to update
+ * the discard counters.
+ */
+ spin_lock(&tmp_ctl.tree_lock);
__btrfs_remove_free_space_cache(&tmp_ctl);
+ spin_unlock(&tmp_ctl.tree_lock);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
block_group->start);
@@ -1123,7 +1163,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1135,8 +1175,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1, EXTENT_DELALLOC, 0,
- 0, NULL);
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1232,7 +1272,7 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
return ret;
}
@@ -1252,8 +1292,8 @@ cleanup_write_cache_enospc(struct inode *inode,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1378,8 +1418,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (ret)
goto out_unlock;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1434,8 +1474,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_drop_pages(io_ctl);
io_ctl_free(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
/*
* at this point the pages are under IO and we're happy,
@@ -2860,7 +2900,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
if (btrfs_is_zoned(fs_info)) {
btrfs_info(fs_info, "free space %llu active %d",
block_group->zone_capacity - block_group->alloc_offset,
- block_group->zone_is_active);
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags));
return;
}
@@ -2964,34 +3005,6 @@ static void __btrfs_return_cluster_to_free_space(
btrfs_put_block_group(block_group);
}
-static void __btrfs_remove_free_space_cache_locked(
- struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_free_space *info;
- struct rb_node *node;
-
- while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, offset_index);
- if (!info->bitmap) {
- unlink_free_space(ctl, info, true);
- kmem_cache_free(btrfs_free_space_cachep, info);
- } else {
- free_bitmap(ctl, info);
- }
-
- cond_resched_lock(&ctl->tree_lock);
- }
-}
-
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
-{
- spin_lock(&ctl->tree_lock);
- __btrfs_remove_free_space_cache_locked(ctl);
- if (ctl->block_group)
- btrfs_discard_update_discardable(ctl->block_group);
- spin_unlock(&ctl->tree_lock);
-}
-
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3009,7 +3022,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
cond_resched_lock(&ctl->tree_lock);
}
- __btrfs_remove_free_space_cache_locked(ctl);
+ __btrfs_remove_free_space_cache(ctl);
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
@@ -3992,7 +4005,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4022,7 +4035,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4044,7 +4057,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 15591b299895..6d419ba53e95 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,7 +113,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 1bf89aa67216..367bcfcf68f5 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1453,8 +1453,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
offset = key.objectid;
while (offset < key.objectid + key.offset) {
bit = free_space_test_bit(block_group, path, offset);
@@ -1490,8 +1488,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
@@ -1531,8 +1527,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
total_found += add_new_free_space(block_group, key.objectid,
key.objectid + key.offset);
if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1552,8 +1546,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1372210869b1..b0807c59e321 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -977,7 +977,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
if (!(start >= locked_page_end || end <= locked_page_start))
locked_page = async_chunk->locked_page;
}
- lock_extent(io_tree, start, end);
+ lock_extent(io_tree, start, end, NULL);
/* We have fall back to uncompressed write */
if (!async_extent->pages)
@@ -1024,7 +1024,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
1 << BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
- btrfs_drop_extent_cache(inode, start, end, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
goto out_free_reserve;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1254,7 +1254,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
- btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1319,8 +1318,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* skip current ordered extent.
*/
if (ret)
- btrfs_drop_extent_cache(inode, start,
- start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + ram_size - 1,
+ false);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1360,7 +1360,7 @@ out:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1524,7 +1524,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
unsigned nofs_flag;
const blk_opf_t write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&inode->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, NULL);
if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
@@ -1666,7 +1666,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
}
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes)
+ u64 bytenr, u64 num_bytes, bool nowait)
{
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
struct btrfs_ordered_sum *sums;
@@ -1674,7 +1674,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
LIST_HEAD(list);
ret = btrfs_lookup_csums_range(csum_root, bytenr,
- bytenr + num_bytes - 1, &list, 0);
+ bytenr + num_bytes - 1, &list, 0,
+ nowait);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1747,7 +1748,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
if (count > 0)
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
- 0, 0, NULL);
+ NULL);
}
return cow_file_range(inode, locked_page, start, end, page_started,
@@ -1800,6 +1801,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
u8 extent_type;
int can_nocow = 0;
int ret = 0;
+ bool nowait = path->nowait;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
@@ -1876,7 +1878,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes);
+ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
+ nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -2099,8 +2102,8 @@ out_check:
1 << BTRFS_ORDERED_PREALLOC,
BTRFS_COMPRESS_NONE);
if (ret) {
- btrfs_drop_extent_cache(inode, cur_offset,
- nocow_end, 0);
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ nocow_end, false);
goto error;
}
} else {
@@ -2548,7 +2551,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
ASSERT(pre + post < len);
- lock_extent(&inode->io_tree, start, start + len - 1);
+ lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
@@ -2622,7 +2625,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
out_unlock:
write_unlock(&em_tree->lock);
- unlock_extent(&inode->io_tree, start, start + len - 1);
+ unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
out:
free_extent_map(split_pre);
free_extent_map(split_mid);
@@ -2700,8 +2703,10 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
ret = extract_ordered_extent(bi, bio,
page_offset(bio_first_bvec_all(bio)->bv_page));
- if (ret)
- goto out;
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
}
/*
@@ -2721,16 +2726,12 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro
return;
ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
- if (ret)
- goto out;
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
}
btrfs_submit_bio(fs_info, bio, mirror_num);
- return;
-out:
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
}
void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
@@ -2757,8 +2758,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
*/
ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
}
@@ -2818,8 +2818,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
ret = set_extent_bit(&inode->io_tree, search_start,
search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
- GFP_NOFS, NULL);
+ EXTENT_DELALLOC_NEW, cached_state,
+ GFP_NOFS);
next:
search_start = extent_map_end(em);
free_extent_map(em);
@@ -2931,7 +2931,7 @@ again:
if (ret)
goto out_page;
- lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PageOrdered(page))
@@ -2939,8 +2939,8 @@ again:
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
unlock_page(page);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -2966,8 +2966,7 @@ out_reserved:
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
@@ -3225,6 +3224,8 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
@@ -3269,7 +3270,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
clear_bits |= EXTENT_LOCKED;
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -3325,7 +3326,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
clear_extent_bit(&inode->io_tree, start, end,
EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
- 0, 0, &cached_state);
+ &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
@@ -3336,7 +3337,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
- (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
&cached_state);
if (trans)
@@ -3361,8 +3361,8 @@ out:
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
+ /* Drop extent maps for the part of the extent we didn't write. */
+ btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -3439,6 +3439,13 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
return 0;
}
+static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
+{
+ u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
+
+ return csums + offset_in_sectors * fs_info->csum_size;
+}
+
/*
* check_data_csum - verify checksum of one sector of uncompressed data
* @inode: inode
@@ -4878,9 +4885,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
- blocksize);
+ blocksize, false);
if (ret < 0) {
- if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
@@ -4922,12 +4929,11 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, block_start, block_end, &cached_state);
+ lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
unlock_page(page);
put_page(page);
btrfs_start_ordered_extent(ordered, 1);
@@ -4937,13 +4943,12 @@ again:
clear_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state);
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
@@ -4960,11 +4965,11 @@ again:
btrfs_page_clear_checked(fs_info, page, block_start,
block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
- unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
+ EXTENT_NORESERVE, NULL, GFP_NOFS);
out_unlock:
if (ret) {
@@ -5021,8 +5026,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
return ret;
}
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
- offset, 0, 0, len, 0, len, 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
if (ret) {
btrfs_abort_transaction(trans, ret);
} else {
@@ -5046,7 +5050,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
@@ -5094,10 +5097,11 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (err)
break;
- btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ cur_offset + hole_size - 1,
+ false);
btrfs_set_inode_full_sync(inode);
goto next;
}
@@ -5112,16 +5116,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
- while (1) {
- write_lock(&em_tree->lock);
- err = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- if (err != -EEXIST)
- break;
- btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset +
- hole_size - 1, 0);
- }
+ err = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
} else {
err = btrfs_inode_set_file_extent_range(inode,
@@ -5137,7 +5132,7 @@ next:
break;
}
free_extent_map(em);
- unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
+ unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
return err;
}
@@ -5271,7 +5266,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
* While truncating the inode pages during eviction, we get the VFS
* calling btrfs_invalidate_folio() against each folio of the inode. This
* is slow because the calls to btrfs_invalidate_folio() result in a
- * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * huge amount of calls to lock_extent() and clear_extent_bit(),
* which keep merging and splitting extent_state structures over and over,
* wasting lots of time.
*
@@ -5283,29 +5278,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
static void evict_inode_truncate_pages(struct inode *inode)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
struct rb_node *node;
ASSERT(inode->i_state & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
- write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
- struct extent_map *em;
-
- node = rb_first_cached(&map_tree->map);
- em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- remove_extent_mapping(map_tree, em);
- free_extent_map(em);
- if (need_resched()) {
- write_unlock(&map_tree->lock);
- cond_resched();
- write_lock(&map_tree->lock);
- }
- }
- write_unlock(&map_tree->lock);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* Keep looping until we have no more ranges in the io tree.
@@ -5338,7 +5316,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5353,8 +5331,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
end - start + 1);
clear_extent_bit(io_tree, start, end,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
&cached_state);
cond_resched();
@@ -5707,6 +5684,11 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
BUG_ON(args->root && !BTRFS_I(inode)->root);
+
+ if (args->root && args->root == args->root->fs_info->tree_root &&
+ args->ino != BTRFS_BTREE_INODE_OBJECTID)
+ set_bit(BTRFS_INODE_FREE_SPACE_INODE,
+ &BTRFS_I(inode)->runtime_flags);
return 0;
}
@@ -6867,7 +6849,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_io_tree *io_tree = &inode->io_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -7030,8 +7011,6 @@ next:
}
flush_dcache_page(page);
}
- set_extent_uptodate(io_tree, em->start,
- extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
}
not_found:
@@ -7065,133 +7044,6 @@ out:
return em;
}
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len)
-{
- struct extent_map *em;
- struct extent_map *hole_em = NULL;
- u64 delalloc_start = start;
- u64 end;
- u64 delalloc_len;
- u64 delalloc_end;
- int err = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, start, len);
- if (IS_ERR(em))
- return em;
- /*
- * If our em maps to:
- * - a hole or
- * - a pre-alloc extent,
- * there might actually be delalloc bytes behind it.
- */
- if (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return em;
- else
- hole_em = em;
-
- /* check to see if we've wrapped (len == -1 or similar) */
- end = start + len;
- if (end < start)
- end = (u64)-1;
- else
- end -= 1;
-
- em = NULL;
-
- /* ok, we didn't find anything, lets look for delalloc */
- delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
- end, len, EXTENT_DELALLOC, 1);
- delalloc_end = delalloc_start + delalloc_len;
- if (delalloc_end < delalloc_start)
- delalloc_end = (u64)-1;
-
- /*
- * We didn't find anything useful, return the original results from
- * get_extent()
- */
- if (delalloc_start > end || delalloc_end <= start) {
- em = hole_em;
- hole_em = NULL;
- goto out;
- }
-
- /*
- * Adjust the delalloc_start to make sure it doesn't go backwards from
- * the start they passed in
- */
- delalloc_start = max(start, delalloc_start);
- delalloc_len = delalloc_end - delalloc_start;
-
- if (delalloc_len > 0) {
- u64 hole_start;
- u64 hole_len;
- const u64 hole_end = extent_map_end(hole_em);
-
- em = alloc_extent_map();
- if (!em) {
- err = -ENOMEM;
- goto out;
- }
-
- ASSERT(hole_em);
- /*
- * When btrfs_get_extent can't find anything it returns one
- * huge hole
- *
- * Make sure what it found really fits our range, and adjust to
- * make sure it is based on the start from the caller
- */
- if (hole_end <= start || hole_em->start > end) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = hole_end - hole_start;
- }
-
- if (hole_em && delalloc_start > hole_start) {
- /*
- * Our hole starts before our delalloc, so we have to
- * return just the parts of the hole that go until the
- * delalloc starts
- */
- em->len = min(hole_len, delalloc_start - hole_start);
- em->start = hole_start;
- em->orig_start = hole_start;
- /*
- * Don't adjust block start at all, it is fixed at
- * EXTENT_MAP_HOLE
- */
- em->block_start = hole_em->block_start;
- em->block_len = hole_len;
- if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- } else {
- /*
- * Hole is out of passed range or it starts after
- * delalloc range
- */
- em->start = delalloc_start;
- em->len = delalloc_len;
- em->orig_start = delalloc_start;
- em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = delalloc_len;
- }
- } else {
- return hole_em;
- }
-out:
-
- free_extent_map(hole_em);
- if (err) {
- free_extent_map(em);
- return ERR_PTR(err);
- }
- return em;
-}
-
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
@@ -7221,7 +7073,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (ret) {
if (em) {
free_extent_map(em);
- btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + len - 1, false);
}
em = ERR_PTR(ret);
}
@@ -7292,7 +7145,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool strict)
+ u64 *ram_bytes, bool nowait, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct can_nocow_file_extent_args nocow_args = { 0 };
@@ -7308,6 +7161,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), offset, 0);
@@ -7404,7 +7258,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
if (!try_lock_extent(io_tree, lockstart, lockend))
return -EAGAIN;
} else {
- lock_extent_bits(io_tree, lockstart, lockend, cached_state);
+ lock_extent(io_tree, lockstart, lockend, cached_state);
}
/*
* We're concerned with the entire range that we're going to be
@@ -7426,7 +7280,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent_cached(io_tree, lockstart, lockend, cached_state);
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
if (ordered) {
if (nowait) {
@@ -7488,7 +7342,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 ram_bytes, int compress_type,
int type)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
int ret;
@@ -7497,7 +7350,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em_tree = &inode->extent_tree;
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7518,18 +7370,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->compress_type = compress_type;
}
- do {
- btrfs_drop_extent_cache(inode, em->start,
- em->start + em->len - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- /*
- * The caller has taken lock_extent(), who could race with us
- * to add em?
- */
- } while (ret == -EEXIST);
-
+ ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
free_extent_map(em);
return ERR_PTR(ret);
@@ -7577,7 +7418,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false) == 1) {
+ &orig_block_len, &ram_bytes, false, false) == 1) {
bg = btrfs_inc_nocow_writers(fs_info, block_start);
if (bg)
can_nocow = true;
@@ -7762,7 +7603,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write && !(flags & IOMAP_NOWAIT)) {
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&dio_data->data_reserved,
- start, data_alloc_len);
+ start, data_alloc_len, false);
if (!ret)
dio_data->data_space_reserved = true;
else if (ret && !(BTRFS_I(inode)->flags &
@@ -7884,8 +7725,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
if (unlock_extents)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
else
free_extent_state(cached_state);
@@ -7914,8 +7755,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
return 0;
unlock_err:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
err:
if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -7938,7 +7779,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+ NULL);
return 0;
}
@@ -7950,7 +7792,7 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
pos, length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1);
+ pos + length - 1, NULL);
ret = -ENOTBLK;
}
@@ -7975,7 +7817,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
dip->file_offset,
- dip->file_offset + dip->bytes - 1);
+ dip->file_offset + dip->bytes - 1, NULL);
}
kfree(dip->csums);
@@ -7986,7 +7828,7 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
int mirror_num,
enum btrfs_compression_type compress_type)
{
- struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
@@ -8001,8 +7843,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
blk_status_t err = BLK_STS_OK;
struct bvec_iter iter;
@@ -8015,9 +7855,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
if (uptodate &&
(!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
bv.bv_offset))) {
- clean_io_failure(fs_info, failure_tree, io_tree, start,
- bv.bv_page, btrfs_ino(BTRFS_I(inode)),
- bv.bv_offset);
+ btrfs_clean_io_failure(BTRFS_I(inode), start,
+ bv.bv_page, bv.bv_offset);
} else {
int ret;
@@ -8039,10 +7878,10 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
}
-static void btrfs_end_dio_bio(struct bio *bio)
+static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_dio_private *dip = bbio->private;
+ struct bio *bio = &bbio->bio;
blk_status_t err = bio->bi_status;
if (err)
@@ -8068,7 +7907,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
blk_status_t ret;
/* Save the original iter for read repair */
@@ -8091,8 +7930,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
*/
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
}
} else {
@@ -8175,9 +8013,8 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
* This will never fail as it's passing GPF_NOFS and
* the allocation is backed by btrfs_bioset.
*/
- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
+ btrfs_end_dio_bio, dip);
btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
@@ -8259,6 +8096,25 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
+ /*
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+ * compression enabled. The first filemap_fdatawrite_range() only kicks
+ * in the compression of data (in an async thread) and will return
+ * before the compression is done and writeback is started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
@@ -8391,14 +8247,14 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, &cached_state);
+ lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
struct btrfs_ordered_extent *ordered;
- bool delete_states;
u64 range_end;
u32 range_len;
+ u32 extra_flags = 0;
ordered = btrfs_lookup_first_ordered_range(inode, cur,
page_end + 1 - cur);
@@ -8408,7 +8264,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* No ordered extent covering this range, we are safe
* to delete all extent states in the range.
*/
- delete_states = true;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
goto next;
}
if (ordered->file_offset > cur) {
@@ -8419,7 +8275,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* the ordered extent in the next iteration.
*/
range_end = ordered->file_offset - 1;
- delete_states = true;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
goto next;
}
@@ -8434,7 +8290,6 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
*/
- delete_states = false;
goto next;
}
btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
@@ -8451,7 +8306,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, &cached_state);
+ EXTENT_DEFRAG, &cached_state);
spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8459,6 +8314,12 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
cur - ordered->file_offset);
spin_unlock_irq(&inode->ordered_tree.lock);
+ /*
+ * If the ordered extent has finished, we're safe to delete all
+ * the extent states of the range, otherwise
+ * btrfs_finish_ordered_io() will get executed by endio for
+ * other pages, so we can't delete extent states.
+ */
if (btrfs_dec_test_ordered_pending(inode, &ordered,
cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
@@ -8466,14 +8327,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* The ordered extent has finished, now we're again
* safe to delete all extent states of the range.
*/
- delete_states = true;
- } else {
- /*
- * btrfs_finish_ordered_io() will get executed by endio
- * of other pages, thus we can't delete extent states
- * anymore
- */
- delete_states = false;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
}
next:
if (ordered)
@@ -8497,8 +8351,8 @@ next:
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
- delete_states, &cached_state);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
+ extra_flags, &cached_state);
}
cur = range_end + 1;
}
@@ -8589,11 +8443,11 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, &cached_state);
+ lock_extent(io_tree, page_start, page_end, &cached_state);
ret2 = set_page_extent_mapped(page);
if (ret2 < 0) {
ret = vmf_error(ret2);
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -8604,8 +8458,7 @@ again:
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered, 1);
@@ -8633,13 +8486,12 @@ again:
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+ EXTENT_DEFRAG, &cached_state);
ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
@@ -8659,7 +8511,7 @@ again:
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
@@ -8760,24 +8612,24 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
control.new_size = new_size;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
&cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
* block of the extent just the way it is.
*/
- btrfs_drop_extent_cache(BTRFS_I(inode),
- ALIGN(new_size, fs_info->sectorsize),
- (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ ALIGN(new_size, fs_info->sectorsize),
+ (u64)-1, false);
ret = btrfs_truncate_inode_items(trans, root, &control);
inode_sub_bytes(inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
- (u64)-1, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
@@ -8908,6 +8760,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
+ spin_lock_init(&ei->io_failure_lock);
ei->outstanding_extents = 0;
if (sb->s_magic != BTRFS_TEST_MAGIC)
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
@@ -8924,12 +8777,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
- extent_io_tree_init(fs_info, &ei->io_failure_tree,
- IO_TREE_INODE_IO_FAILURE, inode);
extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT, inode);
- ei->io_tree.track_uptodate = true;
- ei->io_failure_tree.track_uptodate = true;
+ IO_TREE_INODE_FILE_EXTENT, NULL);
+ ei->io_failure_tree = RB_ROOT;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -8944,7 +8794,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
@@ -8959,6 +8809,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
struct btrfs_ordered_extent *ordered;
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_root *root = inode->root;
+ bool freespace_inode;
WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
WARN_ON(vfs_inode->i_data.nrpages);
@@ -8980,6 +8831,12 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
if (!root)
return;
+ /*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
@@ -8988,6 +8845,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
+
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
+
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -8995,7 +8856,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
}
@@ -10008,7 +9869,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
@@ -10064,11 +9924,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
break;
}
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset -1, 0);
-
em = alloc_extent_map();
if (!em) {
+ btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
+ cur_offset + ins.offset - 1, false);
btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -10083,16 +9942,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST)
- break;
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset - 1,
- 0);
- }
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
free_extent_map(em);
next:
num_bytes -= ins.offset;
@@ -10168,7 +10018,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
@@ -10176,7 +10026,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
struct inode *inode;
struct btrfs_new_inode_args new_inode_args = {
.dir = dir,
- .dentry = dentry,
+ .dentry = file->f_path.dentry,
.orphan = true,
};
unsigned int trans_num_items;
@@ -10213,7 +10063,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
set_nlink(inode, 1);
if (!ret) {
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
unlock_new_inode(inode);
mark_inode_dirty(inode);
}
@@ -10225,7 +10075,7 @@ out_new_inode_args:
out_inode:
if (ret)
iput(inode);
- return ret;
+ return finish_open_simple(file, ret);
}
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
@@ -10346,7 +10196,7 @@ static ssize_t btrfs_encoded_read_inline(
}
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
- unlock_extent_cached(io_tree, start, lockend, cached_state);
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -10371,7 +10221,7 @@ struct btrfs_encoded_read_private {
static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
struct bio *bio, int mirror_num)
{
- struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
@@ -10389,7 +10239,7 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
{
const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
- struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+ struct btrfs_encoded_read_private *priv = bbio->private;
struct btrfs_inode *inode = priv->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u32 sectorsize = fs_info->sectorsize;
@@ -10417,10 +10267,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
return BLK_STS_OK;
}
-static void btrfs_encoded_read_endio(struct bio *bio)
+static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
{
- struct btrfs_encoded_read_private *priv = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_encoded_read_private *priv = bbio->private;
blk_status_t status;
status = btrfs_encoded_read_verify_csum(bbio);
@@ -10438,7 +10287,7 @@ static void btrfs_encoded_read_endio(struct bio *bio)
if (!atomic_dec_return(&priv->pending))
wake_up(&priv->wait);
btrfs_bio_free_csum(bbio);
- bio_put(bio);
+ bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
@@ -10485,12 +10334,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, remaining, PAGE_SIZE);
if (!bio) {
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
+ btrfs_encoded_read_endio,
+ &priv);
bio->bi_iter.bi_sector =
(disk_bytenr + cur) >> SECTOR_SHIFT;
- bio->bi_end_io = btrfs_encoded_read_endio;
- bio->bi_private = &priv;
- bio->bi_opf = REQ_OP_READ;
}
if (!bytes ||
@@ -10551,7 +10399,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
if (ret)
goto out;
- unlock_extent_cached(io_tree, start, lockend, cached_state);
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -10621,13 +10469,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1);
if (ret)
goto out_unlock_inode;
- lock_extent_bits(io_tree, start, lockend, &cached_state);
+ lock_extent(io_tree, start, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
cond_resched();
}
@@ -10701,7 +10549,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
em = NULL;
if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
@@ -10722,7 +10570,7 @@ out_em:
free_extent_map(em);
out_unlock_extent:
if (!unlocked)
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
if (!unlocked)
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
@@ -10860,14 +10708,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
end >> PAGE_SHIFT);
if (ret)
goto out_pages;
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
@@ -10921,7 +10769,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
(1 << BTRFS_ORDERED_COMPRESSED),
compression);
if (ret) {
- btrfs_drop_extent_cache(inode, start, end, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
goto out_free_reserved;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -10929,7 +10777,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (start + encoded->len > inode->vfs_inode.i_size)
i_size_write(&inode->vfs_inode, start + encoded->len);
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
btrfs_delalloc_release_extents(inode, num_bytes);
@@ -10960,7 +10808,7 @@ out_free_data_space:
if (!extent_reserved)
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
out_pages:
for (i = 0; i < nr_pages; i++) {
if (pages[i])
@@ -11201,7 +11049,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+ lock_extent(io_tree, 0, isize - 1, &cached_state);
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
@@ -11242,7 +11090,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -11338,7 +11186,7 @@ out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
- unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+ unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fe0cc816b4eb..d5dd8bed1488 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1218,10 +1218,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* get the big lock and read metadata off disk */
if (!locked)
- lock_extent_bits(io_tree, start, end, &cached);
+ lock_extent(io_tree, start, end, &cached);
em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
- unlock_extent_cached(io_tree, start, end, &cached);
+ unlock_extent(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1333,10 +1333,10 @@ again:
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
if (!ordered)
break;
@@ -1616,7 +1616,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
return ret;
clear_extent_bit(&inode->io_tree, start, start + len - 1,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, cached_state);
+ EXTENT_DEFRAG, cached_state);
set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
/* Update the page status */
@@ -1666,9 +1666,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
wait_on_page_writeback(pages[i]);
/* Lock the pages range */
- lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
* Now we have a consistent view about the extent map, re-check
* which range really needs to be defragged.
@@ -1694,9 +1694,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
kfree(entry);
}
unlock_extent:
- unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
free_pages:
for (i = 0; i < nr_pages; i++) {
if (pages[i]) {
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9063072b399b..0eab3cb274a1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -286,6 +286,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
}
/*
+ * Loop around taking references on and locking the root node of the tree in
+ * nowait mode until we end up with a lock on the root node or returning to
+ * avoid blocking.
+ *
+ * Return: root extent buffer with read lock held or -EAGAIN.
+ */
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ if (!btrfs_try_tree_read_lock(eb)) {
+ free_extent_buffer(eb);
+ return ERR_PTR(-EAGAIN);
+ }
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
* DREW locks
* ==========
*
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index ab268be09bb5..490c7a79e995 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -94,6 +94,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 340f995652f2..f9850edfd726 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -88,6 +88,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
return NULL;
}
+/*
+ * Search @root from an entry that starts or comes after @bytenr.
+ *
+ * @root: the root to search.
+ * @bytenr: bytenr to search from.
+ *
+ * Return the rb_node that start at or after @bytenr. If there is no entry at
+ * or after @bytner return NULL.
+ */
+static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+ u64 bytenr)
+{
+ struct rb_node *node = root->rb_node, *ret = NULL;
+ struct rb_simple_node *entry, *ret_entry = NULL;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr) {
+ if (!ret || entry->bytenr < ret_entry->bytenr) {
+ ret = node;
+ ret_entry = entry;
+ }
+
+ node = node->rb_left;
+ } else if (bytenr > entry->bytenr) {
+ node = node->rb_right;
+ } else {
+ return node;
+ }
+ }
+
+ return ret;
+}
+
static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
struct rb_node *node)
{
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1952ac85222c..e54f8280031f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -524,7 +524,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
bool pending;
+ bool freespace_inode;
+ /*
+ * If this is a free space inode the thread has not acquired the ordered
+ * extents lockdep map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
+
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
@@ -580,6 +588,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
}
}
+ btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
@@ -594,6 +604,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
}
spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
+ if (!freespace_inode)
+ btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
@@ -712,10 +724,17 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ bool freespace_inode;
trace_btrfs_ordered_extent_start(inode, entry);
/*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
+ /*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
@@ -723,6 +742,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
+ if (!freespace_inode)
+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
}
@@ -1022,7 +1043,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
cachedp = cached_state;
while (1) {
- lock_extent_bits(&inode->io_tree, start, end, cachedp);
+ lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1035,12 +1056,37 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
refcount_dec(&cache->refs);
break;
}
- unlock_extent_cached(&inode->io_tree, start, end, cachedp);
+ unlock_extent(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
+/*
+ * Lock the passed range and ensure all pending ordered extents in it are run
+ * to completion in nowait mode.
+ *
+ * Return true if btrfs_lock_ordered_range does not return any extents,
+ * otherwise false.
+ */
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, start, end))
+ return false;
+
+ ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
+ if (!ordered)
+ return true;
+
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&inode->io_tree, start, end, NULL);
+
+ return false;
+}
+
+
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
u64 len)
{
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 87792f85e2c4..f59f2dbdb25e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -160,18 +160,6 @@ struct btrfs_ordered_extent {
struct block_device *bdev;
};
-/*
- * calculates the total size you need to allocate for an ordered sum
- * structure spanning 'bytes' in the file
- */
-static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
- unsigned long bytes)
-{
- int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
-
- return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
-}
-
static inline void
btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
{
@@ -218,6 +206,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
u64 post);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index a2ec8ecae8de..055a631276ce 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -270,11 +270,8 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 ino = btrfs_ino(BTRFS_I(inode));
- int ret;
-
- ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
- return ret;
+ return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
}
static int prop_compression_validate(const struct btrfs_inode *inode,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index db723c0026bd..9334c3157c22 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -275,7 +275,7 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
}
/*
- * Add relation specified by two qgoup ids.
+ * Add relation specified by two qgroup ids.
*
* Must be called with qgroup_lock held.
*
@@ -333,6 +333,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
}
#endif
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+{
+ fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
+ BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -401,7 +408,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
if (btrfs_qgroup_status_generation(l, ptr) !=
fs_info->generation) {
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
@@ -419,7 +426,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
btrfs_err(fs_info, "inconsistent qgroup config");
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
qgroup = add_qgroup_rb(fs_info, found_key.offset);
@@ -878,7 +885,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
- btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
@@ -1052,7 +1060,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
btrfs_mark_buffer_dirty(leaf);
@@ -1174,6 +1183,21 @@ out_add_root:
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
@@ -1255,6 +1279,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -1717,7 +1742,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
ret = update_qgroup_limit_item(trans, qgroup);
if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_info(fs_info, "unable to update quota limit for %llu",
qgroupid);
}
@@ -1790,10 +1815,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*/
ASSERT(trans != NULL);
+ if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ return 0;
+
ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
true);
if (ret < 0) {
- trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(trans->fs_info);
btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
@@ -2269,7 +2297,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2280,6 +2308,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
int level;
+ u8 drop_subptree_thres;
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
@@ -2289,6 +2318,23 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
+ spin_lock(&fs_info->qgroup_lock);
+ drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * This function only gets called for snapshot drop, if we hit a high
+ * node here, it means we are going to change ownership for quite a lot
+ * of extents, which will greatly slow down btrfs_commit_transaction().
+ *
+ * So here if we find a high tree here, we just skip the accounting and
+ * mark qgroup inconsistent.
+ */
+ if (root_level >= drop_subptree_thres) {
+ qgroup_mark_inconsistent(fs_info);
+ return 0;
+ }
+
if (!extent_buffer_uptodate(root_eb)) {
ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
@@ -2604,7 +2650,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
if (new_roots) {
@@ -2700,7 +2747,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
- if (!ret) {
+ if (!ret && !(fs_info->qgroup_flags &
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
/*
* Old roots should be searched when inserting qgroup
* extent record
@@ -2773,12 +2821,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
@@ -2789,7 +2835,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2907,7 +2953,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
ret = update_qgroup_limit_item(trans, dstgroup);
if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_info(fs_info,
"unable to update quota limit for %llu",
dstgroup->qgroupid);
@@ -3013,7 +3059,7 @@ out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -3286,7 +3332,8 @@ static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
- !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3351,7 +3398,8 @@ out:
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!stopped)
+ if (!stopped ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
ret = update_qgroup_status_item(trans);
@@ -3362,6 +3410,7 @@ out:
}
}
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
complete_all(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3372,6 +3421,8 @@ out:
if (stopped) {
btrfs_info(fs_info, "qgroup scan paused");
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
+ btrfs_info(fs_info, "qgroup scan cancelled");
} else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
@@ -3434,6 +3485,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -4231,8 +4284,7 @@ out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -4319,7 +4371,7 @@ out:
btrfs_err_rl(fs_info,
"failed to account subtree at bytenr %llu: %d",
subvol_eb->start, ret);
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 0c4dd2a9af96..578c77e94200 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -100,6 +100,9 @@
* subtree rescan for them.
*/
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+
/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 2feb5c20641a..f6395e8288d6 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -275,7 +275,6 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
/* Also inherit the bitmaps from @victim. */
bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
dest->stripe_nsectors);
- dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -814,8 +813,6 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
- if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
/*
* Clear the data bitmap, as the rbio may be cached for later usage.
* do this before before unlock_stripe() so there will be no new bio
@@ -946,6 +943,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
+ btrfs_get_bioc(bioc);
rbio->bioc = bioc;
rbio->nr_pages = num_pages;
rbio->nr_sectors = num_sectors;
@@ -1813,15 +1811,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- btrfs_put_bioc(bioc);
ret = PTR_ERR(rbio);
- goto out_dec_counter;
+ goto fail;
}
rbio->operation = BTRFS_RBIO_WRITE;
rbio_add_bio(rbio, bio);
- rbio->generic_bio_cnt = 1;
-
/*
* don't plug on full rbios, just get them out the door
* as quickly as we can
@@ -1829,7 +1824,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
if (ret)
- goto out_dec_counter;
+ goto fail;
return;
}
@@ -1844,13 +1839,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
} else {
ret = __raid56_parity_write(rbio);
if (ret)
- goto out_dec_counter;
+ goto fail;
}
return;
-out_dec_counter:
- btrfs_bio_counter_dec(fs_info);
+fail:
bio->bi_status = errno_to_blk_status(ret);
bio_endio(bio);
}
@@ -2198,18 +2192,11 @@ cleanup:
* of the drive.
*/
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- int mirror_num, bool generic_io)
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- if (generic_io) {
- ASSERT(bioc->mirror_num == mirror_num);
- btrfs_bio(bio)->mirror_num = mirror_num;
- } else {
- btrfs_get_bioc(bioc);
- }
-
rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
@@ -2225,14 +2212,11 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bioc->map_type);
- kfree(rbio);
+ __free_raid_bio(rbio);
bio->bi_status = BLK_STS_IOERR;
goto out_end_bio;
}
- if (generic_io)
- rbio->generic_bio_cnt = 1;
-
/*
* Loop retry:
* for 'mirror == 2', reconstruct from all other stripes.
@@ -2261,8 +2245,6 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
return;
out_end_bio:
- btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(bioc);
bio_endio(bio);
}
@@ -2326,13 +2308,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
ASSERT(i < rbio->real_stripes);
bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
-
- /*
- * We have already increased bio_counter when getting bioc, record it
- * so we can free it at rbio_orig_end_io().
- */
- rbio->generic_bio_cnt = 1;
-
return rbio;
}
@@ -2772,12 +2747,6 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
return NULL;
}
- /*
- * When we get bioc, we have already increased bio_counter, record it
- * so we can free it at rbio_orig_end_io()
- */
- rbio->generic_bio_cnt = 1;
-
return rbio;
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 6f48f9e4c869..91d5c0adad15 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -89,8 +89,6 @@ struct btrfs_raid_bio {
*/
int bio_list_bytes;
- int generic_bio_cnt;
-
refcount_t refs;
atomic_t stripes_pending;
@@ -166,7 +164,7 @@ static inline int nr_data_stripes(const struct map_lookup *map)
struct btrfs_device;
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- int mirror_num, bool generic_io);
+ int mirror_num);
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 9acf47b11fe6..f50586ff85c8 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -92,7 +92,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, NULL);
+ NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -615,8 +615,8 @@ out:
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
}
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
@@ -634,8 +634,8 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
swap(range1_end, range2_end);
}
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end);
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 45c02aba2492..666a37a0ee89 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1124,10 +1124,10 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (!ret)
continue;
- btrfs_drop_extent_cache(BTRFS_I(inode),
- key.offset, end, 1);
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ key.offset, end, true);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end);
+ key.offset, end, NULL);
}
}
@@ -1566,9 +1566,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
}
return 0;
}
@@ -2869,13 +2869,13 @@ static noinline_for_stack int prealloc_file_extent_cluster(
else
end = cluster->end - offset;
- lock_extent(&inode->io_tree, start, end);
+ lock_extent(&inode->io_tree, start, end, NULL);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&inode->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, NULL);
if (ret)
break;
}
@@ -2890,7 +2890,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
u64 start, u64 end, u64 block_start)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
@@ -2904,18 +2903,11 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->block_start = block_start;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
- }
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ free_extent_map(em);
+
return ret;
}
@@ -3006,7 +2998,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
goto release_page;
/* Mark the range delalloc and dirty for later writeback */
- lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
clamped_end, 0, NULL);
if (ret) {
@@ -3039,7 +3031,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
boundary_start, boundary_end,
EXTENT_BOUNDARY);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@@ -4339,7 +4331,7 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + inode->index_cnt;
csum_root = btrfs_csum_root(fs_info, disk_bytenr);
ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list, 0);
+ disk_bytenr + len - 1, &list, 0, false);
if (ret)
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index d647cb2938c0..e1f599d7a916 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -337,7 +337,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
- int err = 0;
int ret;
path = btrfs_alloc_path();
@@ -350,7 +349,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret == 0) {
leaf = path->nodes[0];
@@ -360,18 +358,18 @@ again:
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name_len) ||
memcmp_extent_buffer(leaf, name, ptr, name_len)) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out;
}
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
- } else
- err = -ENOENT;
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -383,7 +381,7 @@ again:
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
/*
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3afe5fa50a63..f260c53829e5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -54,6 +54,8 @@ struct scrub_ctx;
*/
#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+
struct scrub_recover {
refcount_t refs;
struct btrfs_io_context *bioc;
@@ -62,16 +64,12 @@ struct scrub_recover {
struct scrub_sector {
struct scrub_block *sblock;
- struct page *page;
- struct btrfs_device *dev;
struct list_head list;
u64 flags; /* extent flags */
u64 generation;
- u64 logical;
- u64 physical;
- u64 physical_for_dev_replace;
+ /* Offset in bytes to @sblock. */
+ u32 offset;
atomic_t refs;
- u8 mirror_num;
unsigned int have_csum:1;
unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
@@ -94,8 +92,22 @@ struct scrub_bio {
};
struct scrub_block {
+ /*
+ * Each page will have its page::private used to record the logical
+ * bytenr.
+ */
+ struct page *pages[SCRUB_MAX_PAGES];
struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ struct btrfs_device *dev;
+ /* Logical bytenr of the sblock */
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ /* Length of sblock in bytes */
+ u32 len;
int sector_count;
+ int mirror_num;
+
atomic_t outstanding_sectors;
refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
@@ -202,8 +214,174 @@ struct full_stripe_lock {
struct mutex mutex;
};
+#ifndef CONFIG_64BIT
+/* This structure is for archtectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+ u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+ attach_page_private(page, (void *)logical);
+ return 0;
+#else
+ struct scrub_page_private *spp;
+
+ spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+ if (!spp)
+ return -ENOMEM;
+ spp->logical = logical;
+ attach_page_private(page, (void *)spp);
+ return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+ detach_page_private(page);
+ return;
+#else
+ struct scrub_page_private *spp;
+
+ spp = detach_page_private(page);
+ kfree(spp);
+ return;
+#endif
+}
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+ struct btrfs_device *dev,
+ u64 logical, u64 physical,
+ u64 physical_for_dev_replace,
+ int mirror_num)
+{
+ struct scrub_block *sblock;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ if (!sblock)
+ return NULL;
+ refcount_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->logical = logical;
+ sblock->physical = physical;
+ sblock->physical_for_dev_replace = physical_for_dev_replace;
+ sblock->dev = dev;
+ sblock->mirror_num = mirror_num;
+ sblock->no_io_error_seen = 1;
+ /*
+ * Scrub_block::pages will be allocated at alloc_scrub_sector() when
+ * the corresponding page is not allocated.
+ */
+ return sblock;
+}
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+ u64 logical, gfp_t gfp)
+{
+ const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+ struct scrub_sector *ssector;
+
+ /* We must never have scrub_block exceed U32_MAX in size. */
+ ASSERT(logical - sblock->logical < U32_MAX);
+
+ ssector = kzalloc(sizeof(*ssector), gfp);
+ if (!ssector)
+ return NULL;
+
+ /* Allocate a new page if the slot is not allocated */
+ if (!sblock->pages[page_index]) {
+ int ret;
+
+ sblock->pages[page_index] = alloc_page(gfp);
+ if (!sblock->pages[page_index]) {
+ kfree(ssector);
+ return NULL;
+ }
+ ret = attach_scrub_page_private(sblock->pages[page_index],
+ sblock->logical + (page_index << PAGE_SHIFT));
+ if (ret < 0) {
+ kfree(ssector);
+ __free_page(sblock->pages[page_index]);
+ sblock->pages[page_index] = NULL;
+ return NULL;
+ }
+ }
+
+ atomic_set(&ssector->refs, 1);
+ ssector->sblock = sblock;
+ /* The sector to be added should not be used */
+ ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+ ssector->offset = logical - sblock->logical;
+
+ /* The sector count must be smaller than the limit */
+ ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+ sblock->sectors[sblock->sector_count] = ssector;
+ sblock->sector_count++;
+ sblock->len += sblock->sctx->fs_info->sectorsize;
+
+ return ssector;
+}
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+ pgoff_t index;
+ /*
+ * When calling this function, ssector must be alreaday attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ index = ssector->offset >> PAGE_SHIFT;
+ ASSERT(index < SCRUB_MAX_PAGES);
+ ASSERT(sblock->pages[index]);
+ ASSERT(PagePrivate(sblock->pages[index]));
+ return sblock->pages[index];
+}
+
+static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+
+ /*
+ * When calling this function, ssector must be already attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ return offset_in_page(ssector->offset);
+}
+
+static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
+{
+ return page_address(scrub_sector_get_page(ssector)) +
+ scrub_sector_get_page_offset(ssector);
+}
+
+static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
+ unsigned int len)
+{
+ return bio_add_page(bio, scrub_sector_get_page(ssector), len,
+ scrub_sector_get_page_offset(ssector));
+}
+
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck);
+ struct scrub_block *sblocks_for_recheck[]);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror);
@@ -533,10 +711,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
- for (i = 0; i < sbio->sector_count; i++) {
- WARN_ON(!sbio->sectors[i]->page);
+ for (i = 0; i < sbio->sector_count; i++)
scrub_block_put(sbio->sectors[i]->sblock);
- }
bio_put(sbio->bio);
}
@@ -726,15 +902,22 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
int ret;
WARN_ON(sblock->sector_count < 1);
- dev = sblock->sectors[0]->dev;
+ dev = sblock->dev;
fs_info = sblock->sctx->fs_info;
+ /* Super block error, no need to search extent tree. */
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ errstr, rcu_str_deref(dev->name),
+ sblock->physical);
+ return;
+ }
path = btrfs_alloc_path();
if (!path)
return;
- swarn.physical = sblock->sectors[0]->physical;
- swarn.logical = sblock->sectors[0]->logical;
+ swarn.physical = sblock->physical;
+ swarn.logical = sblock->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -804,13 +987,14 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
struct scrub_ctx *sctx = sblock_to_check->sctx;
- struct btrfs_device *dev;
+ struct btrfs_device *dev = sblock_to_check->dev;
struct btrfs_fs_info *fs_info;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ /* One scrub_block for each mirror */
+ struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
@@ -825,22 +1009,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
fs_info = sctx->fs_info;
if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
- * if we find an error in a super block, we just report it.
+ * If we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
+ scrub_print_warning("super block error", sblock_to_check);
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
return 0;
}
- logical = sblock_to_check->sectors[0]->logical;
- BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
- failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
+ logical = sblock_to_check->logical;
+ ASSERT(sblock_to_check->mirror_num);
+ failed_mirror_index = sblock_to_check->mirror_num - 1;
is_metadata = !(sblock_to_check->sectors[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->sectors[0]->have_csum;
- dev = sblock_to_check->sectors[0]->dev;
if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
return 0;
@@ -902,17 +1087,28 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* repaired area is verified in order to correctly maintain
* the statistics.
*/
-
- sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
- sizeof(*sblocks_for_recheck), GFP_KERNEL);
- if (!sblocks_for_recheck) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ /*
+ * Note: the two members refs and outstanding_sectors are not
+ * used in the blocks that are used for the recheck procedure.
+ *
+ * But alloc_scrub_block() will initialize sblock::ref anyway,
+ * so we can use scrub_block_put() to clean them up.
+ *
+ * And here we don't setup the physical/dev for the sblock yet,
+ * they will be correctly initialized in scrub_setup_recheck_block().
+ */
+ sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
+ logical, 0, 0, mirror_index);
+ if (!sblocks_for_recheck[mirror_index]) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
}
/* Setup the context, map the logical blocks and alloc the sectors */
@@ -926,7 +1122,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
- sblock_bad = sblocks_for_recheck + failed_mirror_index;
+ sblock_bad = sblocks_for_recheck[failed_mirror_index];
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, 1);
@@ -1011,22 +1207,22 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
- if (!sblocks_for_recheck[mirror_index].sector_count)
+ if (!sblocks_for_recheck[mirror_index]->sector_count)
break;
- sblock_other = sblocks_for_recheck + mirror_index;
+ sblock_other = sblocks_for_recheck[mirror_index];
} else {
struct scrub_recover *r = sblock_bad->sectors[0]->recover;
int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
- if (!sblocks_for_recheck[1].sector_count)
+ if (!sblocks_for_recheck[1]->sector_count)
break;
ASSERT(failed_mirror_index == 0);
- sblock_other = sblocks_for_recheck + 1;
- sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
+ sblock_other = sblocks_for_recheck[1];
+ sblock_other->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
@@ -1097,12 +1293,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/* Try to find no-io-error sector in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].sector_count > 0;
+ sblocks_for_recheck[mirror_index]->sector_count > 0;
mirror_index++) {
- if (!sblocks_for_recheck[mirror_index].
+ if (!sblocks_for_recheck[mirror_index]->
sectors[sector_num]->io_error) {
- sblock_other = sblocks_for_recheck +
- mirror_index;
+ sblock_other = sblocks_for_recheck[mirror_index];
break;
}
}
@@ -1176,25 +1371,28 @@ did_not_correct_error:
}
out:
- if (sblocks_for_recheck) {
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
- mirror_index++) {
- struct scrub_block *sblock = sblocks_for_recheck +
- mirror_index;
- struct scrub_recover *recover;
- int i;
-
- for (i = 0; i < sblock->sector_count; i++) {
- sblock->sectors[i]->sblock = NULL;
- recover = sblock->sectors[i]->recover;
- if (recover) {
- scrub_put_recover(fs_info, recover);
- sblock->sectors[i]->recover = NULL;
- }
- scrub_sector_put(sblock->sectors[i]);
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
+ struct scrub_recover *recover;
+ int sector_index;
+
+ /* Not allocated, continue checking the next mirror */
+ if (!sblock)
+ continue;
+
+ for (sector_index = 0; sector_index < sblock->sector_count;
+ sector_index++) {
+ /*
+ * Here we just cleanup the recover, each sector will be
+ * properly cleaned up by later scrub_block_put()
+ */
+ recover = sblock->sectors[sector_index]->recover;
+ if (recover) {
+ scrub_put_recover(fs_info, recover);
+ sblock->sectors[sector_index]->recover = NULL;
}
}
- kfree(sblocks_for_recheck);
+ scrub_block_put(sblock);
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
@@ -1244,12 +1442,12 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck)
+ struct scrub_block *sblocks_for_recheck[])
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
+ u64 logical = original_sblock->logical;
u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
- u64 logical = original_sblock->sectors[0]->logical;
u64 generation = original_sblock->sectors[0]->generation;
u64 flags = original_sblock->sectors[0]->flags;
u64 have_csum = original_sblock->sectors[0]->have_csum;
@@ -1264,11 +1462,6 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
int nmirrors;
int ret;
- /*
- * Note: the two members refs and outstanding_sectors are not used (and
- * not set) in the blocks that are used for the recheck procedure.
- */
-
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
@@ -1307,24 +1500,19 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblock;
struct scrub_sector *sector;
- sblock = sblocks_for_recheck + mirror_index;
+ sblock = sblocks_for_recheck[mirror_index];
sblock->sctx = sctx;
- sector = kzalloc(sizeof(*sector), GFP_NOFS);
+ sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
if (!sector) {
-leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_sector_get(sector);
- sblock->sectors[sector_index] = sector;
- sector->sblock = sblock;
sector->flags = flags;
sector->generation = generation;
- sector->logical = logical;
sector->have_csum = have_csum;
if (have_csum)
memcpy(sector->csum,
@@ -1339,21 +1527,20 @@ leave_nomem:
mirror_index,
&stripe_index,
&stripe_offset);
- sector->physical = bioc->stripes[stripe_index].physical +
- stripe_offset;
- sector->dev = bioc->stripes[stripe_index].dev;
+ /*
+ * We're at the first sector, also populate @sblock
+ * physical and dev.
+ */
+ if (sector_index == 0) {
+ sblock->physical =
+ bioc->stripes[stripe_index].physical +
+ stripe_offset;
+ sblock->dev = bioc->stripes[stripe_index].dev;
+ sblock->physical_for_dev_replace =
+ original_sblock->physical_for_dev_replace;
+ }
BUG_ON(sector_index >= original_sblock->sector_count);
- sector->physical_for_dev_replace =
- original_sblock->sectors[sector_index]->
- physical_for_dev_replace;
- /* For missing devices, dev->bdev is NULL */
- sector->mirror_num = mirror_index + 1;
- sblock->sector_count++;
- sector->page = alloc_page(GFP_NOFS);
- if (!sector->page)
- goto leave_nomem;
-
scrub_get_recover(recover);
sector->recover = recover;
}
@@ -1377,11 +1564,11 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
{
DECLARE_COMPLETION_ONSTACK(done);
- bio->bi_iter.bi_sector = sector->logical >> 9;
+ bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
+ SECTOR_SHIFT;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
- raid56_parity_recover(bio, sector->recover->bioc,
- sector->sblock->sectors[0]->mirror_num, false);
+ raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
@@ -1395,17 +1582,16 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
int i;
/* All sectors in sblock belong to the same stripe on the same device. */
- ASSERT(first_sector->dev);
- if (!first_sector->dev->bdev)
+ ASSERT(sblock->dev);
+ if (!sblock->dev->bdev)
goto out;
- bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
+ bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
for (i = 0; i < sblock->sector_count; i++) {
struct scrub_sector *sector = sblock->sectors[i];
- WARN_ON(!sector->page);
- bio_add_page(bio, sector->page, PAGE_SIZE, 0);
+ bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
}
if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
@@ -1449,16 +1635,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct bio bio;
struct bio_vec bvec;
- if (sector->dev->bdev == NULL) {
+ if (sblock->dev->bdev == NULL) {
sector->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!sector->page);
- bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
- bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
- bio.bi_iter.bi_sector = sector->physical >> 9;
+ bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
+ bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
+ SECTOR_SHIFT;
btrfsic_check_bio(&bio);
if (submit_bio_wait(&bio)) {
@@ -1475,7 +1661,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
- struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
+ struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
int ret;
ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1521,30 +1707,29 @@ static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- BUG_ON(sector_bad->page == NULL);
- BUG_ON(sector_good->page == NULL);
if (force_write || sblock_bad->header_error ||
sblock_bad->checksum_error || sector_bad->io_error) {
struct bio bio;
struct bio_vec bvec;
int ret;
- if (!sector_bad->dev->bdev) {
+ if (!sblock_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
- bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
- bio.bi_iter.bi_sector = sector_bad->physical >> 9;
- __bio_add_page(&bio, sector_good->page, sectorsize, 0);
+ bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+ bio.bi_iter.bi_sector = (sblock_bad->physical +
+ sector_bad->offset) >> SECTOR_SHIFT;
+ ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
bio_uninit(&bio);
if (ret) {
- btrfs_dev_stat_inc_and_print(sector_bad->dev,
+ btrfs_dev_stat_inc_and_print(sblock_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
return -EIO;
@@ -1577,11 +1762,11 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
{
+ const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
struct scrub_sector *sector = sblock->sectors[sector_num];
- BUG_ON(sector->page == NULL);
if (sector->io_error)
- clear_page(page_address(sector->page));
+ memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
}
@@ -1608,9 +1793,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
+static void scrub_block_get(struct scrub_block *sblock)
+{
+ refcount_inc(&sblock->refs);
+}
+
static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_sector *sector)
{
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
int ret;
const u32 sectorsize = sctx->fs_info->sectorsize;
@@ -1629,14 +1820,15 @@ again:
}
sbio = sctx->wr_curr_bio;
if (sbio->sector_count == 0) {
- ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
+ ret = fill_writer_pointer_gap(sctx, sector->offset +
+ sblock->physical_for_dev_replace);
if (ret) {
mutex_unlock(&sctx->wr_lock);
return ret;
}
- sbio->physical = sector->physical_for_dev_replace;
- sbio->logical = sector->logical;
+ sbio->physical = sblock->physical_for_dev_replace + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
sbio->dev = sctx->wr_tgtdev;
if (!sbio->bio) {
sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
@@ -1647,14 +1839,14 @@ again:
sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
} else if (sbio->physical + sbio->sector_count * sectorsize !=
- sector->physical_for_dev_replace ||
+ sblock->physical_for_dev_replace + sector->offset ||
sbio->logical + sbio->sector_count * sectorsize !=
- sector->logical) {
+ sblock->logical + sector->offset) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
if (ret != sectorsize) {
if (sbio->sector_count < 1) {
bio_put(sbio->bio);
@@ -1668,6 +1860,13 @@ again:
sbio->sectors[sbio->sector_count] = sector;
scrub_sector_get(sector);
+ /*
+ * Since ssector no longer holds a page, but uses sblock::pages, we
+ * have to ensure the sblock had not been freed before our write bio
+ * finished.
+ */
+ scrub_block_get(sector->sblock);
+
sbio->sector_count++;
if (sbio->sector_count == sctx->sectors_per_bio)
scrub_wr_submit(sctx);
@@ -1729,8 +1928,14 @@ static void scrub_wr_bio_end_io_worker(struct work_struct *work)
}
}
- for (i = 0; i < sbio->sector_count; i++)
+ /*
+ * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
+ * endio we should put the sblock.
+ */
+ for (i = 0; i < sbio->sector_count; i++) {
+ scrub_block_put(sbio->sectors[i]->sblock);
scrub_sector_put(sbio->sectors[i]);
+ }
bio_put(sbio->bio);
kfree(sbio);
@@ -1762,7 +1967,7 @@ static int scrub_checksum(struct scrub_block *sblock)
else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = scrub_checksum_tree_block(sblock);
else if (flags & BTRFS_EXTENT_FLAG_SUPER)
- (void)scrub_checksum_super(sblock);
+ ret = scrub_checksum_super(sblock);
else
WARN_ON(1);
if (ret)
@@ -1785,15 +1990,11 @@ static int scrub_checksum_data(struct scrub_block *sblock)
if (!sector->have_csum)
return 0;
- kaddr = page_address(sector->page);
+ kaddr = scrub_sector_get_kaddr(sector);
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- /*
- * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
- * only contains one sector of data.
- */
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
if (memcmp(csum, sector->csum, fs_info->csum_size))
@@ -1826,7 +2027,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
ASSERT(sblock->sector_count == num_sectors);
sector = sblock->sectors[0];
- kaddr = page_address(sector->page);
+ kaddr = scrub_sector_get_kaddr(sector);
h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
@@ -1835,7 +2036,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (sector->logical != btrfs_stack_header_bytenr(h))
+ if (sblock->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
if (sector->generation != btrfs_stack_header_generation(h)) {
@@ -1856,7 +2057,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
sectorsize - BTRFS_CSUM_SIZE);
for (i = 1; i < num_sectors; i++) {
- kaddr = page_address(sblock->sectors[i]->page);
+ kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
crypto_shash_update(shash, kaddr, sectorsize);
}
@@ -1881,10 +2082,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
BUG_ON(sblock->sector_count < 1);
sector = sblock->sectors[0];
- kaddr = page_address(sector->page);
+ kaddr = scrub_sector_get_kaddr(sector);
s = (struct btrfs_super_block *)kaddr;
- if (sector->logical != btrfs_super_bytenr(s))
+ if (sblock->logical != btrfs_super_bytenr(s))
++fail_cor;
if (sector->generation != btrfs_super_generation(s))
@@ -1901,31 +2102,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
++fail_cor;
- if (fail_cor + fail_gen) {
- /*
- * if we find an error in a super block, we just report it.
- * They will get written with the next transaction commit
- * anyway
- */
- spin_lock(&sctx->stat_lock);
- ++sctx->stat.super_errors;
- spin_unlock(&sctx->stat_lock);
- if (fail_cor)
- btrfs_dev_stat_inc_and_print(sector->dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- else
- btrfs_dev_stat_inc_and_print(sector->dev,
- BTRFS_DEV_STAT_GENERATION_ERRS);
- }
-
return fail_cor + fail_gen;
}
-static void scrub_block_get(struct scrub_block *sblock)
-{
- refcount_inc(&sblock->refs);
-}
-
static void scrub_block_put(struct scrub_block *sblock)
{
if (refcount_dec_and_test(&sblock->refs)) {
@@ -1936,6 +2115,12 @@ static void scrub_block_put(struct scrub_block *sblock)
for (i = 0; i < sblock->sector_count; i++)
scrub_sector_put(sblock->sectors[i]);
+ for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
+ if (sblock->pages[i]) {
+ detach_scrub_page_private(sblock->pages[i]);
+ __free_page(sblock->pages[i]);
+ }
+ }
kfree(sblock);
}
}
@@ -1947,11 +2132,8 @@ static void scrub_sector_get(struct scrub_sector *sector)
static void scrub_sector_put(struct scrub_sector *sector)
{
- if (atomic_dec_and_test(&sector->refs)) {
- if (sector->page)
- __free_page(sector->page);
+ if (atomic_dec_and_test(&sector->refs))
kfree(sector);
- }
}
/*
@@ -2056,9 +2238,9 @@ again:
}
sbio = sctx->bios[sctx->curr];
if (sbio->sector_count == 0) {
- sbio->physical = sector->physical;
- sbio->logical = sector->logical;
- sbio->dev = sector->dev;
+ sbio->physical = sblock->physical + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
+ sbio->dev = sblock->dev;
if (!sbio->bio) {
sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
REQ_OP_READ, GFP_NOFS);
@@ -2068,16 +2250,16 @@ again:
sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
} else if (sbio->physical + sbio->sector_count * sectorsize !=
- sector->physical ||
+ sblock->physical + sector->offset ||
sbio->logical + sbio->sector_count * sectorsize !=
- sector->logical ||
- sbio->dev != sector->dev) {
+ sblock->logical + sector->offset ||
+ sbio->dev != sblock->dev) {
scrub_submit(sctx);
goto again;
}
sbio->sectors[sbio->sector_count] = sector;
- ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
if (ret != sectorsize) {
if (sbio->sector_count < 1) {
bio_put(sbio->bio);
@@ -2102,6 +2284,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
struct scrub_block *sblock = bio->bi_private;
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
+ btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
sblock->no_io_error_seen = 0;
@@ -2118,8 +2301,8 @@ static void scrub_missing_raid56_worker(struct work_struct *work)
u64 logical;
struct btrfs_device *dev;
- logical = sblock->sectors[0]->logical;
- dev = sblock->sectors[0]->dev;
+ logical = sblock->logical;
+ dev = sblock->dev;
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
@@ -2157,7 +2340,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->sector_count << fs_info->sectorsize_bits;
- u64 logical = sblock->sectors[0]->logical;
+ u64 logical = sblock->logical;
struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
@@ -2193,17 +2376,16 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
for (i = 0; i < sblock->sector_count; i++) {
struct scrub_sector *sector = sblock->sectors[i];
- /*
- * For now, our scrub is still one page per sector, so pgoff
- * is always 0.
- */
- raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
+ raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
+ scrub_sector_get_page_offset(sector),
+ sector->offset + sector->sblock->logical);
}
INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
+ btrfs_put_bioc(bioc);
return;
rbio_out:
@@ -2225,7 +2407,8 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ sblock = alloc_scrub_block(sctx, dev, logical, physical,
+ physical_for_dev_replace, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2233,12 +2416,6 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
-
for (index = 0; len > 0; index++) {
struct scrub_sector *sector;
/*
@@ -2248,36 +2425,22 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
*/
u32 l = min(sectorsize, len);
- sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
if (!sector) {
-leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
- scrub_sector_get(sector);
- sblock->sectors[index] = sector;
- sector->sblock = sblock;
- sector->dev = dev;
sector->flags = flags;
sector->generation = gen;
- sector->logical = logical;
- sector->physical = physical;
- sector->physical_for_dev_replace = physical_for_dev_replace;
- sector->mirror_num = mirror_num;
if (csum) {
sector->have_csum = 1;
memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
sector->have_csum = 0;
}
- sblock->sector_count++;
- sector->page = alloc_page(GFP_KERNEL);
- if (!sector->page)
- goto leave_nomem;
len -= l;
logical += l;
physical += l;
@@ -2423,8 +2586,9 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
if (sblock->sparity && corrupted && !sblock->data_corrected) {
- u64 start = sblock->sectors[0]->logical;
- u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
+ u64 start = sblock->logical;
+ u64 end = sblock->logical +
+ sblock->sectors[sblock->sector_count - 1]->offset +
sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
@@ -2508,11 +2672,17 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
+ /*
+ * Block size determines how many scrub_block will be allocated. Here
+ * we use BTRFS_STRIPE_LEN (64KiB) as default limit, so we won't
+ * allocate too many scrub_block, while still won't cause too large
+ * bios for large extents.
+ */
if (flags & BTRFS_EXTENT_FLAG_DATA) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
blocksize = map->stripe_len;
else
- blocksize = sctx->fs_info->sectorsize;
+ blocksize = BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
@@ -2578,7 +2748,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
ASSERT(IS_ALIGNED(len, sectorsize));
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2586,51 +2756,32 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
struct scrub_sector *sector;
- sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
if (!sector) {
-leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
- /* For scrub block */
- scrub_sector_get(sector);
sblock->sectors[index] = sector;
/* For scrub parity */
scrub_sector_get(sector);
list_add_tail(&sector->list, &sparity->sectors_list);
- sector->sblock = sblock;
- sector->dev = dev;
sector->flags = flags;
sector->generation = gen;
- sector->logical = logical;
- sector->physical = physical;
- sector->mirror_num = mirror_num;
if (csum) {
sector->have_csum = 1;
memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
sector->have_csum = 0;
}
- sblock->sector_count++;
- sector->page = alloc_page(GFP_KERNEL);
- if (!sector->page)
- goto leave_nomem;
-
/* Iterate over the stripe range in sectorsize steps */
len -= sectorsize;
@@ -2774,6 +2925,7 @@ static void scrub_parity_bio_endio_worker(struct work_struct *work)
work);
struct scrub_ctx *sctx = sparity->sctx;
+ btrfs_bio_counter_dec(sctx->fs_info);
scrub_free_parity(sparity);
scrub_pending_bio_dec(sctx);
}
@@ -2824,6 +2976,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
sparity->scrub_dev,
&sparity->dbitmap,
sparity->nsectors);
+ btrfs_put_bioc(bioc);
if (!rbio)
goto rbio_out;
@@ -2835,7 +2988,6 @@ rbio_out:
bio_put(bio);
bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(bioc);
bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -3077,7 +3229,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
ret = btrfs_lookup_csums_range(csum_root, extent_start,
extent_start + extent_size - 1,
- &sctx->csum_list, 1);
+ &sctx->csum_list, 1, false);
if (ret) {
scrub_parity_mark_sectors_error(sparity, extent_start,
extent_size);
@@ -3266,7 +3418,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
/* Block group removed? */
spin_lock(&bg->lock);
- if (bg->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
spin_unlock(&bg->lock);
ret = 0;
break;
@@ -3303,7 +3455,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
ret = btrfs_lookup_csums_range(csum_root, cur_logical,
cur_logical + scrub_len - 1,
- &sctx->csum_list, 1);
+ &sctx->csum_list, 1, false);
if (ret)
break;
}
@@ -3606,7 +3758,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
* kthread or relocation.
*/
spin_lock(&bg->lock);
- if (!bg->removed)
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
ret = -EINVAL;
spin_unlock(&bg->lock);
@@ -3764,13 +3916,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
}
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
- spin_lock(&cache->lock);
- if (!cache->to_copy) {
+ if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
}
- spin_unlock(&cache->lock);
}
/*
@@ -3782,7 +3932,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* repair extents.
*/
spin_lock(&cache->lock);
- if (cache->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
@@ -3942,8 +4092,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* balance is triggered or it becomes used and unused again.
*/
spin_lock(&cache->lock);
- if (!cache->removed && !cache->ro && cache->reserved == 0 &&
- cache->used == 0) {
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
+ !cache->ro && cache->reserved == 0 && cache->used == 0) {
spin_unlock(&cache->lock);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_queue_work(&fs_info->discard_ctl,
@@ -4102,36 +4252,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
- if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
- /*
- * in this case scrub is unable to calculate the checksum
- * the way scrub is implemented. Do not handle this
- * situation at all because it won't ever happen.
- */
- btrfs_err(fs_info,
- "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
- fs_info->nodesize,
- BTRFS_STRIPE_LEN);
- return -EINVAL;
- }
+ /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
- if (fs_info->nodesize >
- SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
- fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
- /*
- * Would exhaust the array bounds of sectorv member in
- * struct scrub_block
- */
- btrfs_err(fs_info,
-"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
- fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
- fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
- return -EINVAL;
- }
+ /*
+ * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
+ * value (max nodesize / min sectorsize), thus nodesize should always
+ * be fine.
+ */
+ ASSERT(fs_info->nodesize <=
+ SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
/* Allocate outside of device_list_mutex */
sctx = scrub_setup_ctx(fs_info, is_dev_replace);
@@ -4205,6 +4340,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -4213,6 +4354,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the current running scrub (hold by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -4239,6 +4390,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
out:
scrub_workers_put(fs_info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e7671afcee4f..ec6e1752af2c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -15,6 +15,7 @@
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
+#include <linux/fsverity.h>
#include "send.h"
#include "ctree.h"
@@ -127,6 +128,8 @@ struct send_ctx {
bool cur_inode_new_gen;
bool cur_inode_deleted;
bool ignore_cur_inode;
+ bool cur_inode_needs_verity;
+ void *verity_descriptor;
u64 send_progress;
@@ -345,6 +348,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
switch (sctx->proto) {
case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
+ case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
default: return false;
}
}
@@ -624,6 +628,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
}
+TLV_PUT_DEFINE_INT(8)
TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
@@ -842,17 +847,32 @@ out:
return ret;
}
+struct btrfs_inode_info {
+ u64 size;
+ u64 gen;
+ u64 mode;
+ u64 uid;
+ u64 gid;
+ u64 rdev;
+ u64 fileattr;
+ u64 nlink;
+};
+
/*
* Helper function to retrieve some fields from an inode item.
*/
-static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
- u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
- u64 *gid, u64 *rdev, u64 *fileattr)
+static int get_inode_info(struct btrfs_root *root, u64 ino,
+ struct btrfs_inode_info *info)
{
int ret;
+ struct btrfs_path *path;
struct btrfs_inode_item *ii;
struct btrfs_key key;
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
@@ -860,47 +880,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- return ret;
+ goto out;
}
+ if (!info)
+ goto out;
+
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- if (size)
- *size = btrfs_inode_size(path->nodes[0], ii);
- if (gen)
- *gen = btrfs_inode_generation(path->nodes[0], ii);
- if (mode)
- *mode = btrfs_inode_mode(path->nodes[0], ii);
- if (uid)
- *uid = btrfs_inode_uid(path->nodes[0], ii);
- if (gid)
- *gid = btrfs_inode_gid(path->nodes[0], ii);
- if (rdev)
- *rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->size = btrfs_inode_size(path->nodes[0], ii);
+ info->gen = btrfs_inode_generation(path->nodes[0], ii);
+ info->mode = btrfs_inode_mode(path->nodes[0], ii);
+ info->uid = btrfs_inode_uid(path->nodes[0], ii);
+ info->gid = btrfs_inode_gid(path->nodes[0], ii);
+ info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
/*
* Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
* otherwise logically split to 32/32 parts.
*/
- if (fileattr)
- *fileattr = btrfs_inode_flags(path->nodes[0], ii);
+ info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
+out:
+ btrfs_free_path(path);
return ret;
}
-static int get_inode_info(struct btrfs_root *root,
- u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev, u64 *fileattr)
+static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
- struct btrfs_path *path;
int ret;
+ struct btrfs_inode_info info;
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
- ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
- rdev, fileattr);
- btrfs_free_path(path);
+ if (!gen)
+ return -EPERM;
+
+ ret = get_inode_info(root, ino, &info);
+ if (!ret)
+ *gen = info.gen;
return ret;
}
@@ -1643,21 +1659,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
int right_ret;
u64 left_gen;
u64 right_gen;
+ struct btrfs_inode_info info;
- ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- left_ret = ret;
+ left_ret = (info.nlink == 0) ? -ENOENT : ret;
+ left_gen = info.gen;
if (!sctx->parent_root) {
right_ret = -ENOENT;
} else {
- ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
- NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- right_ret = ret;
+ right_ret = (info.nlink == 0) ? -ENOENT : ret;
+ right_gen = info.gen;
}
if (!left_ret && !right_ret) {
@@ -1816,8 +1833,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
btrfs_release_path(path);
if (dir_gen) {
- ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
goto out;
}
@@ -1874,6 +1890,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
int ret = 0;
u64 gen;
u64 other_inode = 0;
+ struct btrfs_inode_info info;
if (!sctx->parent_root)
goto out;
@@ -1888,8 +1905,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
* and we can just unlink this entry.
*/
if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1916,13 +1932,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
*/
if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
- ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, who_mode, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, other_inode, &info);
if (ret < 0)
goto out;
ret = 1;
*who_ino = other_inode;
+ *who_gen = info.gen;
+ *who_mode = info.mode;
} else {
ret = 0;
}
@@ -1955,8 +1972,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
if (dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1978,8 +1994,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
if (ret < 0)
goto out;
@@ -2645,6 +2660,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
int ret = 0;
struct fs_path *p;
int cmd;
+ struct btrfs_inode_info info;
u64 gen;
u64 mode;
u64 rdev;
@@ -2656,10 +2672,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
if (ino != sctx->cur_ino) {
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
- NULL, NULL, &rdev, NULL);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0)
goto out;
+ gen = info.gen;
+ mode = info.mode;
+ rdev = info.rdev;
} else {
gen = sctx->cur_inode_gen;
mode = sctx->cur_inode_mode;
@@ -3359,8 +3377,7 @@ finish:
/*
* The parent inode might have been deleted in the send snapshot
*/
- ret = get_inode_info(sctx->send_root, cur->dir, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->send_root, cur->dir, NULL);
if (ret == -ENOENT) {
ret = 0;
continue;
@@ -3534,12 +3551,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
- &left_gen, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
goto out;
- ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
- &right_gen, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
@@ -3669,8 +3684,7 @@ static int is_ancestor(struct btrfs_root *root,
cur_offset = item_size;
}
- ret = get_inode_info(root, parent, NULL, &parent_gen,
- NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent, &parent_gen);
if (ret < 0)
goto out;
ret = check_ino_in_path(root, ino1, ino1_gen,
@@ -3760,9 +3774,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
memcmp(path_before->start, path_after->start, len1))) {
u64 parent_ino_gen;
- ret = get_inode_info(sctx->parent_root, ino, NULL,
- &parent_ino_gen, NULL, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
if (ret < 0)
goto out;
if (ino_gen == parent_ino_gen) {
@@ -4441,8 +4453,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index,
struct recorded_ref *ref;
u64 dir_gen;
- ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
goto out;
@@ -4472,8 +4483,7 @@ static int record_deleted_ref_if_needed(int num, u64 dir, int index,
struct recorded_ref *ref;
u64 dir_gen;
- ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
goto out;
@@ -4886,6 +4896,84 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
return ret;
}
+static int send_verity(struct send_ctx *sctx, struct fs_path *path,
+ struct fsverity_descriptor *desc)
+{
+ int ret;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
+ le8_to_cpu(desc->hash_algorithm));
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
+ 1U << le8_to_cpu(desc->log_blocksize));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
+ le8_to_cpu(desc->salt_size));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
+ le32_to_cpu(desc->sig_size));
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int process_verity(struct send_ctx *sctx)
+{
+ int ret = 0;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct inode *inode;
+ struct fs_path *p;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ if (ret < 0)
+ goto iput;
+
+ if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ ret = -EMSGSIZE;
+ goto iput;
+ }
+ if (!sctx->verity_descriptor) {
+ sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
+ GFP_KERNEL);
+ if (!sctx->verity_descriptor) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ }
+
+ ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ if (ret < 0)
+ goto iput;
+
+ p = fs_path_alloc();
+ if (!p) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto free_path;
+
+ ret = send_verity(sctx, p, sctx->verity_descriptor);
+ if (ret < 0)
+ goto free_path;
+
+free_path:
+ fs_path_free(p);
+iput:
+ iput(inode);
+ return ret;
+}
+
static inline u64 max_send_read_size(const struct send_ctx *sctx)
{
return sctx->send_max_size - SZ_16K;
@@ -5056,8 +5144,7 @@ static int send_clone(struct send_ctx *sctx,
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
if (clone_root->root == sctx->send_root) {
- ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
- &gen, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
if (ret < 0)
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -5536,6 +5623,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ struct btrfs_inode_info info;
u64 clone_src_i_size = 0;
/*
@@ -5565,12 +5653,11 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
* There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = __get_inode_info(clone_root->root, path, clone_root->ino,
- &clone_src_i_size, NULL, NULL, NULL, NULL, NULL,
- NULL);
+ ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
goto out;
+ clone_src_i_size = info.size;
/*
* We can't send a clone operation for the entire range if we find
@@ -6259,6 +6346,7 @@ out:
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
int ret = 0;
+ struct btrfs_inode_info info;
u64 left_mode;
u64 left_uid;
u64 left_gid;
@@ -6301,11 +6389,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
goto out;
-
- ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
- &left_mode, &left_uid, &left_gid, NULL, &left_fileattr);
+ ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ left_mode = info.mode;
+ left_uid = info.uid;
+ left_gid = info.gid;
+ left_fileattr = info.fileattr;
if (!sctx->parent_root || sctx->cur_inode_new) {
need_chown = 1;
@@ -6316,11 +6406,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
} else {
u64 old_size;
- ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
- &old_size, NULL, &right_mode, &right_uid,
- &right_gid, NULL, &right_fileattr);
+ ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ old_size = info.size;
+ right_mode = info.mode;
+ right_uid = info.uid;
+ right_gid = info.gid;
+ right_fileattr = info.fileattr;
if (left_uid != right_uid || left_gid != right_gid)
need_chown = 1;
@@ -6378,6 +6471,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
}
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
+ && sctx->cur_inode_needs_verity) {
+ ret = process_verity(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
ret = send_capabilities(sctx);
if (ret < 0)
goto out;
@@ -6407,86 +6507,6 @@ out:
return ret;
}
-struct parent_paths_ctx {
- struct list_head *refs;
- struct send_ctx *sctx;
-};
-
-static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
- void *ctx)
-{
- struct parent_paths_ctx *ppctx = ctx;
-
- /*
- * Pass 0 as the generation for the directory, we don't care about it
- * here as we have no new references to add, we just want to delete all
- * references for an inode.
- */
- return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs,
- name, dir, 0, ppctx->sctx);
-}
-
-/*
- * Issue unlink operations for all paths of the current inode found in the
- * parent snapshot.
- */
-static int btrfs_unlink_all_paths(struct send_ctx *sctx)
-{
- LIST_HEAD(deleted_refs);
- struct btrfs_path *path;
- struct btrfs_root *root = sctx->parent_root;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct parent_paths_ctx ctx;
- int iter_ret = 0;
- int ret;
-
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
-
- ctx.refs = &deleted_refs;
- ctx.sctx = sctx;
-
- btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
- if (found_key.objectid != key.objectid)
- break;
- if (found_key.type != key.type &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)
- break;
-
- ret = iterate_inode_ref(root, path, &found_key, 1,
- record_parent_ref, &ctx);
- if (ret < 0)
- goto out;
- }
- /* Catch error found during iteration */
- if (iter_ret < 0) {
- ret = iter_ret;
- goto out;
- }
-
- while (!list_empty(&deleted_refs)) {
- struct recorded_ref *ref;
-
- ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
- ret = send_unlink(sctx, ref->full_path);
- if (ret < 0)
- goto out;
- recorded_ref_free(ref);
- }
- ret = 0;
-out:
- btrfs_free_path(path);
- if (ret)
- __free_recorded_refs(&deleted_refs);
- return ret;
-}
-
static void close_current_inode(struct send_ctx *sctx)
{
u64 i_size;
@@ -6577,25 +6597,36 @@ static int changed_inode(struct send_ctx *sctx,
* file descriptor against it or turning a RO snapshot into RW mode,
* keep an open file descriptor against a file, delete it and then
* turn the snapshot back to RO mode before using it for a send
- * operation. So if we find such cases, ignore the inode and all its
- * items completely if it's a new inode, or if it's a changed inode
- * make sure all its previous paths (from the parent snapshot) are all
- * unlinked and all other the inode items are ignored.
+ * operation. The former is what the receiver operation does.
+ * Therefore, if we want to send these snapshots soon after they're
+ * received, we need to handle orphan inodes as well. Moreover, orphans
+ * can appear not only in the send snapshot but also in the parent
+ * snapshot. Here are several cases:
+ *
+ * Case 1: BTRFS_COMPARE_TREE_NEW
+ * | send snapshot | action
+ * --------------------------------
+ * nlink | 0 | ignore
+ *
+ * Case 2: BTRFS_COMPARE_TREE_DELETED
+ * | parent snapshot | action
+ * ----------------------------------
+ * nlink | 0 | as usual
+ * Note: No unlinks will be sent because there're no paths for it.
+ *
+ * Case 3: BTRFS_COMPARE_TREE_CHANGED
+ * | | parent snapshot | send snapshot | action
+ * -----------------------------------------------------------------------
+ * subcase 1 | nlink | 0 | 0 | ignore
+ * subcase 2 | nlink | >0 | 0 | new_gen(deletion)
+ * subcase 3 | nlink | 0 | >0 | new_gen(creation)
+ *
*/
- if (result == BTRFS_COMPARE_TREE_NEW ||
- result == BTRFS_COMPARE_TREE_CHANGED) {
- u32 nlinks;
-
- nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
- if (nlinks == 0) {
+ if (result == BTRFS_COMPARE_TREE_NEW) {
+ if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
sctx->ignore_cur_inode = true;
- if (result == BTRFS_COMPARE_TREE_CHANGED)
- ret = btrfs_unlink_all_paths(sctx);
goto out;
}
- }
-
- if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = true;
sctx->cur_inode_deleted = false;
@@ -6616,6 +6647,16 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->right_path->nodes[0], right_ii);
} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 new_nlinks, old_nlinks;
+
+ new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
+ if (new_nlinks == 0 && old_nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ goto out;
+ } else if (new_nlinks == 0 || old_nlinks == 0) {
+ sctx->cur_inode_new_gen = 1;
+ }
/*
* We need to do some special handling in case the inode was
* reported as changed with a changed generation number. This
@@ -6642,38 +6683,44 @@ static int changed_inode(struct send_ctx *sctx,
/*
* Now process the inode as if it was new.
*/
- sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = true;
- sctx->cur_inode_deleted = false;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_rdev = btrfs_inode_rdev(
- sctx->left_path->nodes[0], left_ii);
- ret = send_create_inode_if_needed(sctx);
- if (ret < 0)
- goto out;
+ if (new_nlinks > 0) {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0],
+ left_ii);
+ ret = send_create_inode_if_needed(sctx);
+ if (ret < 0)
+ goto out;
- ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
- if (ret < 0)
- goto out;
- /*
- * Advance send_progress now as we did not get into
- * process_recorded_refs_if_needed in the new_gen case.
- */
- sctx->send_progress = sctx->cur_ino + 1;
+ ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+ if (ret < 0)
+ goto out;
+ /*
+ * Advance send_progress now as we did not get
+ * into process_recorded_refs_if_needed in the
+ * new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
- /*
- * Now process all extents and xattrs of the inode as if
- * they were all new.
- */
- ret = process_all_extents(sctx);
- if (ret < 0)
- goto out;
- ret = process_all_new_xattrs(sctx);
- if (ret < 0)
- goto out;
+ /*
+ * Now process all extents and xattrs of the
+ * inode as if they were all new.
+ */
+ ret = process_all_extents(sctx);
+ if (ret < 0)
+ goto out;
+ ret = process_all_new_xattrs(sctx);
+ if (ret < 0)
+ goto out;
+ }
} else {
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = false;
@@ -6785,18 +6832,27 @@ static int changed_extent(struct send_ctx *sctx,
return ret;
}
+static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ sctx->cur_inode_needs_verity = true;
+ }
+ return ret;
+}
+
static int dir_changed(struct send_ctx *sctx, u64 dir)
{
u64 orig_gen, new_gen;
int ret;
- ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &new_gen);
if (ret)
return ret;
- ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
if (ret)
return ret;
@@ -6939,6 +6995,9 @@ static int changed_cb(struct btrfs_path *left_path,
ret = changed_xattr(sctx, result);
else if (key->type == BTRFS_EXTENT_DATA_KEY)
ret = changed_extent(sctx, result);
+ else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
+ key->offset == 0)
+ ret = changed_verity(sctx, result);
}
out:
@@ -8036,6 +8095,7 @@ out:
kvfree(sctx->clone_roots);
kfree(sctx->send_buf_pages);
kvfree(sctx->send_buf);
+ kvfree(sctx->verity_descriptor);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 4bb4e6a638cb..f7585cfa7e52 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -10,7 +10,12 @@
#include <linux/types.h>
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
+/* Conditional support for the upcoming protocol version. */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_SEND_STREAM_VERSION 3
+#else
#define BTRFS_SEND_STREAM_VERSION 2
+#endif
/*
* In send stream v1, no command is larger than 64K. In send stream v2, no limit
@@ -92,8 +97,11 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_ENCODED_WRITE = 25,
BTRFS_SEND_C_MAX_V2 = 25,
+ /* Version 3 */
+ BTRFS_SEND_C_ENABLE_VERITY = 26,
+ BTRFS_SEND_C_MAX_V3 = 26,
/* End */
- BTRFS_SEND_C_MAX = 25,
+ BTRFS_SEND_C_MAX = 26,
};
/* attributes in send stream */
@@ -160,8 +168,14 @@ enum {
BTRFS_SEND_A_ENCRYPTION = 31,
BTRFS_SEND_A_MAX_V2 = 31,
- /* End */
- BTRFS_SEND_A_MAX = 31,
+ /* Version 3 */
+ BTRFS_SEND_A_VERITY_ALGORITHM = 32,
+ BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33,
+ BTRFS_SEND_A_VERITY_SALT_DATA = 34,
+ BTRFS_SEND_A_VERITY_SIG_DATA = 35,
+ BTRFS_SEND_A_MAX_V3 = 35,
+
+ __BTRFS_SEND_A_MAX = 35,
};
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 435559ba94fa..f171bf875633 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -293,32 +293,36 @@ out:
return ret;
}
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly, u64 bytes_zone_unusable,
- bool active, struct btrfs_space_info **space_info)
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
{
struct btrfs_space_info *found;
- int factor;
+ int factor, index;
- factor = btrfs_bg_type_to_factor(flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, flags);
+ found = btrfs_find_space_info(info, block_group->flags);
ASSERT(found);
spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- if (active)
- found->active_total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- found->bytes_zone_unusable += bytes_zone_unusable;
- if (total_bytes > 0)
+ found->total_bytes += block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ found->active_total_bytes += block_group->length;
+ found->disk_total += block_group->length * factor;
+ found->bytes_used += block_group->used;
+ found->disk_used += block_group->used * factor;
+ found->bytes_readonly += block_group->bytes_super;
+ found->bytes_zone_unusable += block_group->zone_unusable;
+ if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
spin_unlock(&found->lock);
- *space_info = found;
+
+ block_group->space_info = found;
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ down_write(&found->groups_sem);
+ list_add_tail(&block_group->list, &found->block_groups[index]);
+ up_write(&found->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -472,28 +476,47 @@ do { \
spin_unlock(&__rsv->lock); \
} while (0)
+static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
+{
+ switch (space_info->flags) {
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "SYSTEM";
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "DATA+METADATA";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "DATA";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "METADATA";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info)
{
+ const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
- info->flags,
+ btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
+ flag_str,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
+"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
info->bytes_readonly, info->bytes_zone_unusable);
-
- DUMP_BLOCK_RSV(fs_info, global_block_rsv);
- DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
- DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
-
}
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -505,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
spin_lock(&info->lock);
__btrfs_dump_space_info(fs_info, info);
+ dump_global_block_rsv(fs_info);
spin_unlock(&info->lock);
if (!dump_block_groups)
@@ -1662,7 +1686,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
&space_info->priority_tickets);
}
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- used += orig_bytes;
/*
* We will do the space reservation dance during log replay,
* which means we won't have fs_info->fs_root set, so don't do
@@ -1737,7 +1760,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
- flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+ flush == BTRFS_RESERVE_NO_FLUSH);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
@@ -1749,3 +1773,17 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
}
return ret;
}
+
+/* Dump all the space infos when we abort a transaction due to ENOSPC. */
+__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ btrfs_info(fs_info, "dumping space info:");
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ spin_lock(&space_info->lock);
+ __btrfs_dump_space_info(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+ }
+ dump_global_block_rsv(fs_info);
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 12fd6147f92d..ce66023a9eb8 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -123,10 +123,8 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly, u64 bytes_zone_unusable,
- bool active, struct btrfs_space_info **space_info);
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group);
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -159,4 +157,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
+void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 6fc2b77ae5c3..9a176af847d7 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -337,7 +337,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
*
* Even with 0 returned, the page still need extra check to make sure
* it's really the correct page, as the caller is using
- * find_get_pages_contig(), which can race with page invalidating.
+ * filemap_get_folios_contig(), which can race with page invalidating.
*/
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f89beac3c665..9be4fd2db0f4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -346,12 +346,14 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno)
+ unsigned int line, int errno, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
WRITE_ONCE(trans->transaction->aborted, errno);
+ if (first_hit && errno == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
@@ -626,6 +628,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
+ const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
@@ -1137,10 +1140,12 @@ out:
}
if (!ret)
ret = btrfs_check_mountopts_zoned(info);
- if (!ret && btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
+ if (!ret && !remounting) {
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ btrfs_info(info, "using free space tree");
+ }
return ret;
}
@@ -2009,14 +2014,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
- /* V1 cache is not supported for subpage mount. */
- if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
- btrfs_warn(fs_info,
- "v1 space cache is not supported for page size %lu with sectorsize %u",
- PAGE_SIZE, fs_info->sectorsize);
- ret = -EINVAL;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0)
goto restore;
- }
+
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
@@ -2550,11 +2551,71 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans);
}
+static int check_dev_super(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_super_block *sb;
+ int ret = 0;
+
+ /* This should be called with fs still frozen. */
+ ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
+
+ /* Missing dev, no need to check. */
+ if (!dev->bdev)
+ return 0;
+
+ /* Only need to check the primary super block. */
+ sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ /* Btrfs_validate_super() includes fsid check against super->fsid. */
+ ret = btrfs_validate_super(fs_info, sb, 0);
+ if (ret < 0)
+ goto out;
+
+ if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
+ btrfs_super_generation(sb),
+ fs_info->last_trans_committed);
+ ret = -EUCLEAN;
+ goto out;
+ }
+out:
+ btrfs_release_disk_super(sb);
+ return ret;
+}
+
static int btrfs_unfreeze(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_device *device;
+ int ret = 0;
+ /*
+ * Make sure the fs is not changed by accident (like hibernation then
+ * modified by other OS).
+ * If we found anything wrong, we mark the fs error immediately.
+ *
+ * And since the fs is frozen, no one can modify the fs yet, thus
+ * we don't need to hold device_list_mutex.
+ */
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ ret = check_dev_super(device);
+ if (ret < 0) {
+ btrfs_handle_fs_error(fs_info, ret,
+ "super block on devid %llu got modified unexpectedly",
+ device->devid);
+ break;
+ }
+ }
clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+
+ /*
+ * We still return 0, to allow VFS layer to unfreeze the fs even the
+ * above checks failed. Since the fs is either fine or read-only, we're
+ * safe to continue, without causing further damage.
+ */
return 0;
}
@@ -2662,17 +2723,21 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_compress;
- err = extent_io_init();
+ err = extent_state_init_cachep();
if (err)
goto free_cachep;
- err = extent_state_cache_init();
+ err = extent_buffer_init_cachep();
+ if (err)
+ goto free_extent_cachep;
+
+ err = btrfs_bioset_init();
if (err)
- goto free_extent_io;
+ goto free_eb_cachep;
err = extent_map_init();
if (err)
- goto free_extent_state_cache;
+ goto free_bioset;
err = ordered_data_init();
if (err)
@@ -2724,10 +2789,12 @@ free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
-free_extent_state_cache:
- extent_state_cache_exit();
-free_extent_io:
- extent_io_exit();
+free_bioset:
+ btrfs_bioset_exit();
+free_eb_cachep:
+ extent_buffer_free_cachep();
+free_extent_cachep:
+ extent_state_free_cachep();
free_cachep:
btrfs_destroy_cachep();
free_compress:
@@ -2746,8 +2813,9 @@ static void __exit exit_btrfs_fs(void)
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
- extent_state_cache_exit();
- extent_io_exit();
+ btrfs_bioset_exit();
+ extent_state_free_cachep();
+ extent_buffer_free_cachep();
btrfs_interface_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index d5d0717fd09a..699b54b3acaa 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -35,12 +35,12 @@
* qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
* space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>
* raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ * discard_attrs /sys/fs/btrfs/<uuid>/discard
*
* When built with BTRFS_CONFIG_DEBUG:
*
* btrfs_debug_feature_attrs /sys/fs/btrfs/debug
* btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
- * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard
*/
struct btrfs_feature_attr {
@@ -286,6 +286,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
@@ -317,6 +318,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
+ BTRFS_FEAT_ATTR_PTR(block_group_tree),
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
@@ -429,12 +431,10 @@ static const struct attribute_group btrfs_static_feature_attr_group = {
.attrs = btrfs_supported_static_feature_attrs,
};
-#ifdef CONFIG_BTRFS_DEBUG
-
/*
* Discard statistics and tunables
*/
-#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
+#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj))
static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -583,11 +583,11 @@ BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
btrfs_discard_max_discard_size_store);
/*
- * Per-filesystem debugging of discard (when mounted with discard=async).
+ * Per-filesystem stats for discard (when mounted with discard=async).
*
- * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ * Path: /sys/fs/btrfs/<uuid>/discard/
*/
-static const struct attribute *discard_debug_attrs[] = {
+static const struct attribute *discard_attrs[] = {
BTRFS_ATTR_PTR(discard, discardable_bytes),
BTRFS_ATTR_PTR(discard, discardable_extents),
BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
@@ -599,6 +599,8 @@ static const struct attribute *discard_debug_attrs[] = {
NULL,
};
+#ifdef CONFIG_BTRFS_DEBUG
+
/*
* Per-filesystem runtime debugging exported via sysfs.
*
@@ -837,11 +839,8 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_space_info *space_info = to_space_info(kobj);
- ssize_t ret;
- ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
-
- return ret;
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
}
static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -1150,25 +1149,6 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
}
BTRFS_ATTR(, generation, btrfs_generation_show);
-/*
- * Look for an exact string @string in @buffer with possible leading or
- * trailing whitespace
- */
-static bool strmatch(const char *buffer, const char *string)
-{
- const size_t len = strlen(string);
-
- /* Skip leading whitespace */
- buffer = skip_spaces(buffer);
-
- /* Match entire string, check if the rest is whitespace or empty */
- if (strncmp(string, buffer, len) == 0 &&
- strlen(skip_spaces(buffer + len)) == 0)
- return true;
-
- return false;
-}
-
static const char * const btrfs_read_policy_name[] = { "pid" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1202,7 +1182,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (strmatch(buf, btrfs_read_policy_name[i])) {
+ if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
if (i != fs_devices->read_policy) {
fs_devices->read_policy = i;
btrfs_info(fs_devices->fs_info,
@@ -1222,11 +1202,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- ssize_t ret;
- ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
-
- return ret;
+ return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
}
static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -1427,13 +1404,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
-#ifdef CONFIG_BTRFS_DEBUG
- if (fs_info->discard_debug_kobj) {
- sysfs_remove_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
- kobject_del(fs_info->discard_debug_kobj);
- kobject_put(fs_info->discard_debug_kobj);
+ if (fs_info->discard_kobj) {
+ sysfs_remove_files(fs_info->discard_kobj, discard_attrs);
+ kobject_del(fs_info->discard_kobj);
+ kobject_put(fs_info->discard_kobj);
}
+#ifdef CONFIG_BTRFS_DEBUG
if (fs_info->debug_kobj) {
sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
kobject_del(fs_info->debug_kobj);
@@ -2001,20 +1977,18 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
if (error)
goto failure;
+#endif
/* Discard directory */
- fs_info->discard_debug_kobj = kobject_create_and_add("discard",
- fs_info->debug_kobj);
- if (!fs_info->discard_debug_kobj) {
+ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
+ if (!fs_info->discard_kobj) {
error = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
+ error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
if (error)
goto failure;
-#endif
error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
@@ -2041,6 +2015,98 @@ failure:
return error;
}
+static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool enabled;
+
+ spin_lock(&fs_info->qgroup_lock);
+ enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", enabled);
+}
+BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+
+static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool inconsistent;
+
+ spin_lock(&fs_info->qgroup_lock);
+ inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", inconsistent);
+}
+BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show);
+
+static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 result;
+
+ spin_lock(&fs_info->qgroup_lock);
+ result = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", result);
+}
+
+static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 new_thres;
+ int ret;
+
+ ret = kstrtou8(buf, 10, &new_thres);
+ if (ret)
+ return -EINVAL;
+
+ if (new_thres > BTRFS_MAX_LEVEL)
+ return -EINVAL;
+
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->qgroup_drop_subtree_thres = new_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return len;
+}
+BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show,
+ qgroup_drop_subtree_thres_store);
+
+/*
+ * Qgroups global info
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/
+ */
+static struct attribute *qgroups_attrs[] = {
+ BTRFS_ATTR_PTR(qgroups, enabled),
+ BTRFS_ATTR_PTR(qgroups, inconsistent),
+ BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroups);
+
+static void qgroups_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static struct kobj_type qgroups_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = qgroups_groups,
+ .release = qgroups_release,
+};
+
static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
{
return to_fs_info(kobj->parent->parent);
@@ -2166,11 +2232,15 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
if (fs_info->qgroups_kobj)
return 0;
- fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj);
- if (!fs_info->qgroups_kobj) {
- ret = -ENOMEM;
+ fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!fs_info->qgroups_kobj)
+ return -ENOMEM;
+
+ ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype,
+ fsid_kobj, "qgroups");
+ if (ret < 0)
goto out;
- }
+
rbtree_postorder_for_each_entry_safe(qgroup, next,
&fs_info->qgroup_tree, node) {
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index cc9377cf56a3..9c478fa256f6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -243,7 +243,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
kfree(cache->free_space_ctl);
kfree(cache);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index a232b15b8021..350da449db08 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -4,6 +4,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sizes.h>
@@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
{
int ret;
- struct page *pages[16];
+ struct folio_batch fbatch;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- unsigned long nr_pages = end_index - index + 1;
int i;
int count = 0;
int loops = 0;
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long, nr_pages,
- ARRAY_SIZE(pages)), pages);
+ folio_batch_init(&fbatch);
+
+ while (index <= end_index) {
+ ret = filemap_get_folios_contig(inode->i_mapping, &index,
+ end_index, &fbatch);
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (flags & PROCESS_TEST_LOCKED &&
- !PageLocked(pages[i]))
+ !folio_test_locked(folio))
count++;
- if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (flags & PROCESS_UNLOCK && folio_test_locked(folio))
+ folio_unlock(folio);
if (flags & PROCESS_RELEASE)
- put_page(pages[i]);
+ folio_put(folio);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
cond_resched();
loops++;
if (loops > 100000) {
printk(KERN_ERR
- "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n",
- start, end, nr_pages, ret);
+ "stuck in a loop, start %llu, end %llu, ret %d\n",
+ start, end, ret);
break;
}
}
+
return count;
}
@@ -80,7 +82,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
- PRINT_ONE_FLAG(state, dest, cur, DAMAGED);
PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
@@ -172,7 +173,7 @@ static int test_find_delalloc(u32 sectorsize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
unlock_page(locked_page);
put_page(locked_page);
@@ -208,7 +209,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -263,7 +264,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/*
* Now to test where we run into a page that is no longer dirty in the
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 5930cdcae5cb..ebf68fcd2149 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache)
}
/* Cleanup */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/* Now with the extent entry offset into the bitmap */
ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
* [ bitmap ]
* [ del ]
*/
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_err("couldn't add bitmap %d", ret);
@@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* This blew up before, we have part of the free space in a bitmap and
@@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return ret;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -629,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
if (ret)
return ret;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* Now test a similar scenario, but where our extent entry is located
@@ -819,7 +819,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
return ret;
cache->free_space_ctl->op = orig_free_space_ops;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -868,7 +868,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
/* Now validate bitmaps do the correct thing. */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
for (i = 0; i < 2; i++) {
offset = i * BITS_PER_BITMAP * sectorsize;
bytes = (i + 1) * SZ_1M;
@@ -891,7 +891,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
/* Now validate bitmaps with different ->max_extent_size. */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
orig_free_space_ops = cache->free_space_ctl->op;
cache->free_space_ctl->op = &test_free_space_ops;
@@ -998,7 +998,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
cache->free_space_ctl->op = orig_free_space_ops;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index cac89c388131..625f7d398368 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -267,7 +267,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* All of the magic numbers are based on the mapping setup in
@@ -975,7 +975,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1043,7 +1043,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1076,7 +1076,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1092,7 +1092,7 @@ out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0bec10740ad3..d1f1da6820fb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -161,7 +161,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
- struct btrfs_caching_control *caching_ctl, *next;
/*
* At this point no one can be using this transaction to modify any tree
@@ -196,46 +195,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
}
spin_unlock(&cur_trans->dropped_roots_lock);
- /*
- * We have to update the last_byte_to_unpin under the commit_root_sem,
- * at the same time we swap out the commit roots.
- *
- * This is because we must have a real view of the last spot the caching
- * kthreads were while caching. Consider the following views of the
- * extent tree for a block group
- *
- * commit root
- * +----+----+----+----+----+----+----+
- * |\\\\| |\\\\|\\\\| |\\\\|\\\\|
- * +----+----+----+----+----+----+----+
- * 0 1 2 3 4 5 6 7
- *
- * new commit root
- * +----+----+----+----+----+----+----+
- * | | | |\\\\| | |\\\\|
- * +----+----+----+----+----+----+----+
- * 0 1 2 3 4 5 6 7
- *
- * If the cache_ctl->progress was at 3, then we are only allowed to
- * unpin [0,1) and [2,3], because the caching thread has already
- * processed those extents. We are not allowed to unpin [5,6), because
- * the caching thread will re-start it's search from 3, and thus find
- * the hole from [4,6) to add to the free space cache.
- */
- write_lock(&fs_info->block_group_cache_lock);
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- struct btrfs_block_group *cache = caching_ctl->block_group;
-
- if (btrfs_block_group_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
- write_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -313,6 +272,8 @@ loop:
atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
return 0;
}
spin_unlock(&fs_info->trans_lock);
@@ -334,16 +295,23 @@ loop:
if (!cur_trans)
return -ENOMEM;
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
+
spin_lock(&fs_info->trans_lock);
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the checks above
*/
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
goto loop;
} else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
return -EROFS;
}
@@ -397,7 +365,7 @@ loop:
spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+ IO_TREE_TRANS_DIRTY_PAGES, NULL);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS, NULL);
fs_info->generation++;
@@ -541,6 +509,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
TRANS_ABORTED(cur_trans));
@@ -625,7 +594,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
*/
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- delayed_refs_rsv->full == 0) {
+ btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
delayed_refs_bytes = num_bytes;
num_bytes <<= 1;
}
@@ -650,7 +619,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
- !delayed_refs_rsv->full) {
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
/*
* Some people call with btrfs_start_transaction(root, 0)
* because they can be throttled, but have some other mechanism
@@ -859,6 +828,15 @@ static noinline void wait_for_commit(struct btrfs_transaction *commit,
u64 transid = commit->transid;
bool put = false;
+ /*
+ * At the moment this function is called with min_state either being
+ * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
+ */
+ if (min_state == TRANS_STATE_COMPLETED)
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ else
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
while (1) {
wait_event(commit->commit_wait, commit->state >= min_state);
if (put)
@@ -1022,6 +1000,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_dec(cur_trans, trans->type);
cond_wake_up(&cur_trans->writer_wait);
+
+ btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(info, btrfs_trans_num_writers);
+
btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
@@ -1134,7 +1116,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* it's safe to do it (through extent_io_tree_release()).
*/
err = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT, 0, 0, &cached_state);
+ EXTENT_NEED_WAIT, &cached_state);
if (err == -ENOMEM)
err = 0;
if (!err)
@@ -1912,14 +1894,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
-
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- root_item = &fs_info->block_group_root->root_item;
-
- super->block_group_root = root_item->bytenr;
- super->block_group_root_generation = root_item->generation;
- super->block_group_root_level = root_item->level;
- }
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -1967,6 +1941,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
@@ -1994,6 +1969,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has already released the lockdep map as reader
+ * already in btrfs_commit_transaction().
+ */
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
@@ -2118,12 +2099,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- btrfs_end_transaction(trans);
- return ret;
+ goto lockdep_trans_commit_start_release;
}
btrfs_trans_release_metadata(trans);
@@ -2140,10 +2121,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Any running threads may add more while we are here.
*/
ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ if (ret)
+ goto lockdep_trans_commit_start_release;
}
btrfs_create_pending_block_groups(trans);
@@ -2172,10 +2151,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (run_it) {
ret = btrfs_start_dirty_block_groups(trans);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ if (ret)
+ goto lockdep_trans_commit_start_release;
}
}
@@ -2190,6 +2167,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
+
+ btrfs_trans_state_lockdep_release(fs_info,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans, want_state);
@@ -2203,6 +2183,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
if (cur_trans->list.prev != &fs_info->trans_list) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
@@ -2222,7 +2203,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_put_transaction(prev_trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
} else {
spin_unlock(&fs_info->trans_lock);
}
@@ -2236,7 +2217,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
- goto cleanup_transaction;
+ goto lockdep_release;
}
}
@@ -2250,19 +2231,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
ret = btrfs_run_delayed_items(trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
wait_event(cur_trans->writer_wait,
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
ret = btrfs_run_delayed_items(trans);
- if (ret)
+ if (ret) {
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
goto cleanup_transaction;
+ }
btrfs_wait_delalloc_flush(fs_info);
@@ -2271,6 +2261,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* transaction. Otherwise if this transaction commits before the ordered
* extents complete we lose logged data after a power failure.
*/
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
wait_event(cur_trans->pending_wait,
atomic_read(&cur_trans->pending_ordered) == 0);
@@ -2284,10 +2275,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
add_pending_snapshot(trans);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
/*
+ * Make lockdep happy by acquiring the state locks after
+ * btrfs_trans_num_writers is released. If we acquired the state locks
+ * before releasing the btrfs_trans_num_writers lock then lockdep would
+ * complain because we did not follow the reverse order unlocking rule.
+ */
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+
+ /*
* We've started the commit, clear the flag in case we were triggered to
* do an async commit but somebody else started before the transaction
* kthread could do the work.
@@ -2296,6 +2305,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
goto scrub_continue;
}
/*
@@ -2430,6 +2440,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->reloc_mutex);
wake_up(&fs_info->transaction_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
@@ -2461,6 +2472,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
btrfs_finish_extent_commit(trans);
@@ -2474,6 +2486,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
@@ -2502,7 +2515,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
unlock_reloc:
mutex_unlock(&fs_info->reloc_mutex);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
scrub_continue:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
btrfs_scrub_continue(fs_info);
cleanup_transaction:
btrfs_trans_release_metadata(trans);
@@ -2515,6 +2531,16 @@ cleanup_transaction:
cleanup_transaction(trans, ret);
return ret;
+
+lockdep_release:
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ goto cleanup_transaction;
+
+lockdep_trans_commit_start_release:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_end_transaction(trans);
+ return ret;
}
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9205c4a5ca81..813986e38258 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -22,6 +22,8 @@
#include "zoned.h"
#include "inode-item.h"
+#define MAX_CONFLICT_INODES 10
+
/* magic values for the inode_only field in btrfs_log_inode:
*
* LOG_INODE_ALL means to log everything
@@ -31,8 +33,6 @@
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
- LOG_OTHER_INODE,
- LOG_OTHER_INODE_ALL,
};
/*
@@ -801,7 +801,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums, 0);
+ &ordered_sums, 0, false);
if (ret)
goto out;
/*
@@ -1063,8 +1063,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, char *name, int namelen,
- int *search_done)
+ u64 ref_index, char *name, int namelen)
{
int ret;
char *victim_name;
@@ -1126,19 +1125,12 @@ again:
kfree(victim_name);
if (ret)
return ret;
- *search_done = 1;
goto again;
}
kfree(victim_name);
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
-
- /*
- * NOTE: we have searched root tree and checked the
- * corresponding ref, it does not need to check again.
- */
- *search_done = 1;
}
btrfs_release_path(path);
@@ -1202,14 +1194,12 @@ again:
kfree(victim_name);
if (ret)
return ret;
- *search_done = 1;
goto again;
}
kfree(victim_name);
next:
cur_offset += victim_name_len + sizeof(*extref);
}
- *search_done = 1;
}
btrfs_release_path(path);
@@ -1373,103 +1363,6 @@ again:
return ret;
}
-static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
- const u8 ref_type, const char *name,
- const int namelen)
-{
- struct btrfs_key key;
- struct btrfs_path *path;
- const u64 parent_id = btrfs_ino(BTRFS_I(dir));
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = ref_type;
- if (key.type == BTRFS_INODE_REF_KEY)
- key.offset = parent_id;
- else
- key.offset = btrfs_extref_hash(parent_id, name, namelen);
-
- ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = 0;
- goto out;
- }
- if (key.type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0], parent_id, name, namelen);
- else
- ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, namelen);
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int add_link(struct btrfs_trans_handle *trans,
- struct inode *dir, struct inode *inode, const char *name,
- int namelen, u64 ref_index)
-{
- struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_dir_item *dir_item;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct inode *other_inode = NULL;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_item = btrfs_lookup_dir_item(NULL, root, path,
- btrfs_ino(BTRFS_I(dir)),
- name, namelen, 0);
- if (!dir_item) {
- btrfs_release_path(path);
- goto add_link;
- } else if (IS_ERR(dir_item)) {
- ret = PTR_ERR(dir_item);
- goto out;
- }
-
- /*
- * Our inode's dentry collides with the dentry of another inode which is
- * in the log but not yet processed since it has a higher inode number.
- * So delete that other dentry.
- */
- btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
- btrfs_release_path(path);
- other_inode = read_one_inode(root, key.objectid);
- if (!other_inode) {
- ret = -ENOENT;
- goto out;
- }
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
- if (ret)
- goto out;
- /*
- * If we dropped the link count to 0, bump it so that later the iput()
- * on the inode will not free it. We will fixup the link count later.
- */
- if (other_inode->i_nlink == 0)
- set_nlink(other_inode, 1);
-add_link:
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, namelen, 0, ref_index);
-out:
- iput(other_inode);
- btrfs_free_path(path);
-
- return ret;
-}
-
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1490,7 +1383,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
char *name = NULL;
int namelen;
int ret;
- int search_done = 0;
int log_ref_ver = 0;
u64 parent_objectid;
u64 inode_objectid;
@@ -1565,51 +1457,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
-
- if (!search_done) {
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir),
- BTRFS_I(inode),
- inode_objectid,
- parent_objectid,
- ref_index, name, namelen,
- &search_done);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out;
- }
- }
-
- /*
- * If a reference item already exists for this inode
- * with the same parent and name, but different index,
- * drop it and the corresponding directory index entries
- * from the parent before adding the new reference item
- * and dir index entries, otherwise we would fail with
- * -EEXIST returned from btrfs_add_link() below.
- */
- ret = btrfs_inode_ref_exists(inode, dir, key->type,
- name, namelen);
- if (ret > 0) {
- ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen);
- /*
- * If we dropped the link count to 0, bump it so
- * that later the iput() on the inode will not
- * free it. We will fixup the link count later.
- */
- if (!ret && inode->i_nlink == 0)
- set_nlink(inode, 1);
- }
- if (ret < 0)
+ ret = __add_inode_ref(trans, root, path, log,
+ BTRFS_I(dir), BTRFS_I(inode),
+ inode_objectid, parent_objectid,
+ ref_index, name, namelen);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out;
+ }
/* insert our name */
- ret = add_link(trans, dir, inode, name, namelen,
- ref_index);
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
if (ret)
goto out;
@@ -3875,6 +3735,11 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
*last_old_dentry_offset = key.offset;
continue;
}
+
+ /* If we logged this dir index item before, we can skip it. */
+ if (key.offset <= inode->last_dir_index_offset)
+ continue;
+
/*
* We must make sure that when we log a directory entry, the
* corresponding inode, after log replay, has a matching link
@@ -3905,51 +3770,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
ctx->log_new_dentries = true;
}
- if (!ctx->logged_before)
- goto add_to_batch;
-
- /*
- * If we were logged before and have logged dir items, we can skip
- * checking if any item with a key offset larger than the last one
- * we logged is in the log tree, saving time and avoiding adding
- * contention on the log tree. We can only rely on the value of
- * last_dir_index_offset when we know for sure that the inode was
- * previously logged in the current transaction.
- */
- if (key.offset > inode->last_dir_index_offset)
- goto add_to_batch;
- /*
- * Check if the key was already logged before. If not we can add
- * it to a batch for bulk insertion.
- */
- ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- btrfs_release_path(dst_path);
- goto add_to_batch;
- }
-
- /*
- * Item exists in the log. Overwrite the item in the log if it
- * has different content or do nothing if it has exactly the same
- * content. And then flush the current batch if any - do it after
- * overwriting the current item, or we would deadlock otherwise,
- * since we are holding a path for the existing item.
- */
- ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
- if (ret < 0)
- return ret;
-
- if (batch_size > 0) {
- ret = flush_dir_items_batch(trans, log, src, dst_path,
- batch_start, batch_size);
- if (ret < 0)
- return ret;
- batch_size = 0;
- }
- continue;
-add_to_batch:
if (batch_size == 0)
batch_start = i;
batch_size++;
@@ -4136,6 +3956,71 @@ done:
}
/*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * key offset. If that's the case, search for it and update the inode. This
+ * is to avoid lookups in the log tree every time we try to insert a dir index
+ * key from a leaf changed in the current transaction, and to allow us to always
+ * do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_key key;
+ int ret;
+
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (inode->last_dir_index_offset != (u64)-1)
+ return 0;
+
+ if (!ctx->logged_before) {
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+ return 0;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ /*
+ * An error happened or we actually have an index key with an offset
+ * value of (u64)-1. Bail out, we're done.
+ */
+ if (ret <= 0)
+ goto out;
+
+ ret = 0;
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+ /*
+ * No dir index items, bail out and leave last_dir_index_offset with
+ * the value right before the first valid index value.
+ */
+ if (path->slots[0] == 0)
+ goto out;
+
+ /*
+ * btrfs_search_slot() left us at one slot beyond the slot with the last
+ * index key, or beyond the last key of the directory that is not an
+ * index key. If we have an index key before, set last_dir_index_offset
+ * to its offset value, otherwise leave it with a value right before the
+ * first valid index value, as it means we have an empty directory.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+ inode->last_dir_index_offset = key.offset;
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
* logging directories is very similar to logging inodes, We find all the items
* from the current transaction and write them to the log.
*
@@ -4157,6 +4042,10 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 max_key;
int ret;
+ ret = update_last_dir_index_offset(inode, path, ctx);
+ if (ret)
+ return ret;
+
min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
ctx->last_dir_item_offset = inode->last_dir_index_offset;
@@ -4382,8 +4271,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
- lock_end, &cached_state);
+ ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
if (ret)
return ret;
/*
@@ -4399,8 +4288,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
- &cached_state);
+ unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
return ret;
}
@@ -4513,7 +4402,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
disk_bytenr += extent_offset;
ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
disk_bytenr + extent_num_bytes - 1,
- &ordered_sums, 0);
+ &ordered_sums, 0, false);
if (ret)
goto out;
@@ -4709,7 +4598,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0);
+ csum_len - 1, &ordered_sums, 0, false);
if (ret)
return ret;
@@ -5221,10 +5110,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
* leafs from the log root.
*/
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0,
- 0, hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root,
+ ino, prev_extent_end,
+ hole_len);
if (ret < 0)
return ret;
@@ -5253,10 +5141,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0, 0,
- hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+ prev_extent_end, hole_len);
if (ret < 0)
return ret;
}
@@ -5399,111 +5285,461 @@ out:
return ret;
}
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the later
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+ const struct btrfs_inode *inode)
+{
+ /*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
+ */
+ if (inode->logged_trans == trans->transid &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
+
+ return true;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * do_overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = start_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ u64 ino = btrfs_ino(start_inode);
+ int ret = 0;
+
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (true) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ bool continue_curr_inode = true;
+ int nritems;
+ int i;
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
+ continue_curr_inode = false;
+ break;
+ }
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
+ log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
+ if (ret)
+ goto out;
+ if (ctx->log_new_dentries) {
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dir_elem->ino = di_key.objectid;
+ list_add_tail(&dir_elem->list, &dir_list);
+ }
+ break;
+ }
+
+ if (continue_curr_inode && min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+
+next:
+ if (list_empty(&dir_list))
+ break;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+ ino = dir_elem->ino;
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ struct btrfs_dir_list *next;
+
+ list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+ kfree(dir_elem);
+ }
+
+ return ret;
+}
+
struct btrfs_ino_list {
u64 ino;
u64 parent;
struct list_head list;
};
-static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- u64 ino, u64 parent)
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON_ONCE(ret > 0)) {
+ /*
+ * We have previously found the inode through the commit root
+ * so this should not happen. If it does, just error out and
+ * fallback to a transaction commit.
+ */
+ ret = -ENOENT;
+ } else if (ret == 0) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+ ret = 1;
+ }
+
+ btrfs_release_path(path);
+ path->search_commit_root = 0;
+ path->skip_locking = 0;
+
+ return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- LIST_HEAD(inode_list);
- int ret = 0;
+ struct inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes, in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ return BTRFS_LOG_FORCE_COMMIT;
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction then we either:
+ *
+ * 1) Log the parent directory (later after adding it to the list) if
+ * the inode is a directory. This is because it may be a deleted
+ * subvolume/snapshot or it may be a regular directory that had
+ * deleted subvolumes/snapshots (or subdirectories that had them),
+ * and at the moment we can't deal with dropping subvolumes/snapshots
+ * during log replay. So we just log the parent, which will result in
+ * a fallback to a transaction commit if we are dealing with those
+ * cases (last_unlink_trans will match the current transaction);
+ *
+ * 2) Do nothing if it's not a directory. During log replay we simply
+ * unlink the conflicting dentry from the parent directory and then
+ * add the dentry for our inode. Like this we can avoid logging the
+ * parent directory (and maybe fallback to a transaction commit in
+ * case it has a last_unlink_trans == trans->transid, due to moving
+ * some inode from it to some other directory).
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = conflicting_inode_is_dir(root, ino, path);
+ /* Not a directory or we got an error. */
+ if (ret <= 0)
+ return ret;
+
+ /* Conflicting inode is a directory, so we'll log its parent. */
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(inode);
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
- list_add_tail(&ino_elem->list, &inode_list);
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
- while (!list_empty(&inode_list)) {
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct inode *inode;
+ return 0;
+}
- ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
- list);
- ino = ino_elem->ino;
- parent = ino_elem->parent;
- list_del(&ino_elem->list);
- kfree(ino_elem);
- if (ret)
- continue;
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
- btrfs_release_path(path);
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
+
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct inode *inode;
+ u64 ino;
+ u64 parent;
+
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
- * directory.
+ * directory. See the comment at add_conflicting_inode().
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- if (ret == -ENOENT) {
- inode = btrfs_iget(fs_info->sb, parent, root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- } else {
- ret = btrfs_log_inode(trans,
- BTRFS_I(inode),
- LOG_OTHER_INODE_ALL,
- ctx);
- btrfs_add_delayed_iput(inode);
- }
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget(fs_info->sb, parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
}
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
continue;
}
+
/*
- * If the inode was already logged skip it - otherwise we can
- * hit an infinite loop. Example:
- *
- * From the commit root (previous transaction) we have the
- * following inodes:
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
*
- * inode 257 a directory
- * inode 258 with references "zz" and "zz_link" on inode 257
- * inode 259 with reference "a" on inode 257
- *
- * And in the current (uncommitted) transaction we have:
- *
- * inode 257 a directory, unchanged
- * inode 258 with references "a" and "a2" on inode 257
- * inode 259 with reference "zz_link" on inode 257
- * inode 261 with reference "zz" on inode 257
- *
- * When logging inode 261 the following infinite loop could
- * happen if we don't skip already logged inodes:
- *
- * - we detect inode 258 as a conflicting inode, with inode 261
- * on reference "zz", and log it;
- *
- * - we detect inode 259 as a conflicting inode, with inode 258
- * on reference "a", and log it;
- *
- * - we detect inode 258 as a conflicting inode, with inode 259
- * on reference "zz_link", and log it - again! After this we
- * repeat the above steps forever.
+ * We did this check at add_conflicting_inode(), but here we do
+ * it again because if some other task logged the inode after
+ * that, we can avoid doing it again.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- /*
- * Check the inode's logged_trans only instead of
- * btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists (see
- * btrfs_log_inode()).
- */
- if (BTRFS_I(inode)->logged_trans == trans->transid) {
- spin_unlock(&BTRFS_I(inode)->lock);
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
btrfs_add_delayed_iput(inode);
continue;
}
- spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
@@ -5511,67 +5747,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
- if (ret) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- key.objectid = ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- break;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != ino ||
- (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = 0;
- break;
- }
-
- ret = btrfs_check_ref_name_override(leaf, slot, &key,
- BTRFS_I(inode), &other_ino,
- &other_parent);
- if (ret < 0)
- break;
- if (ret > 0) {
- ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
- if (!ino_elem) {
- ret = -ENOMEM;
- break;
- }
- ino_elem->ino = other_ino;
- ino_elem->parent = other_parent;
- list_add_tail(&ino_elem->list, &inode_list);
- ret = 0;
- }
- path->slots[0]++;
- }
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
}
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
return ret;
}
@@ -5582,7 +5767,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
- const bool recursive_logging,
const int inode_only,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
@@ -5621,8 +5805,8 @@ again:
break;
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5646,11 +5830,12 @@ again:
return ret;
ins_nr = 0;
- ret = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root, path,
+ other_ino,
+ other_parent, ctx);
if (ret)
return ret;
- btrfs_release_path(path);
goto next_key;
}
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
@@ -5733,6 +5918,371 @@ next_key:
return ret;
}
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch,
+ const struct btrfs_delayed_item *first_item)
+{
+ const struct btrfs_delayed_item *curr = first_item;
+ int ret;
+
+ ret = btrfs_insert_empty_items(trans, log, path, batch);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < batch->nr; i++) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ curr = list_next_entry(curr, log_list);
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
+ const int max_batch_size = 195;
+ const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_item_batch batch = {
+ .nr = 0,
+ .total_data_size = 0,
+ };
+ const struct btrfs_delayed_item *first = NULL;
+ const struct btrfs_delayed_item *curr;
+ char *ins_data;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ u64 curr_batch_size = 0;
+ int batch_idx = 0;
+ int ret;
+
+ /* We are adding dir index items to the log tree. */
+ lockdep_assert_held(&inode->log_mutex);
+
+ /*
+ * We collect delayed items before copying index keys from the subvolume
+ * to the log tree. However just after we collected them, they may have
+ * been flushed (all of them or just some of them), and therefore we
+ * could have copied them from the subvolume tree to the log tree.
+ * So find the first delayed item that was not yet logged (they are
+ * sorted by index number).
+ */
+ list_for_each_entry(curr, delayed_ins_list, log_list) {
+ if (curr->index > inode->last_dir_index_offset) {
+ first = curr;
+ break;
+ }
+ }
+
+ /* Empty list or all delayed items were already logged. */
+ if (!first)
+ return 0;
+
+ ins_data = kmalloc(max_batch_size * sizeof(u32) +
+ max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+ ins_sizes = (u32 *)ins_data;
+ batch.data_sizes = ins_sizes;
+ ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+ batch.keys = ins_keys;
+
+ curr = first;
+ while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+ const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
+ if (curr_batch_size + curr_size > leaf_data_size ||
+ batch.nr == max_batch_size) {
+ ret = insert_delayed_items_batch(trans, log, path,
+ &batch, first);
+ if (ret)
+ goto out;
+ batch_idx = 0;
+ batch.nr = 0;
+ batch.total_data_size = 0;
+ curr_batch_size = 0;
+ first = curr;
+ }
+
+ ins_sizes[batch_idx] = curr->data_len;
+ ins_keys[batch_idx].objectid = ino;
+ ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[batch_idx].offset = curr->index;
+ curr_batch_size += curr_size;
+ batch.total_data_size += curr->data_len;
+ batch.nr++;
+ batch_idx++;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ ASSERT(batch.nr >= 1);
+ ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+ curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+ log_list);
+ inode->last_dir_index_offset = curr->index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ const struct btrfs_delayed_item *curr;
+
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ const struct btrfs_delayed_item *next;
+ int ret;
+
+ /*
+ * Find a range of consecutive dir index items to delete. Like
+ * this we log a single dir range item spanning several contiguous
+ * dir items instead of logging one range item per dir index item.
+ */
+ next = list_next_entry(curr, log_list);
+ while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+ if (next->index != curr->index + 1)
+ break;
+ curr = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ last_dir_index = curr->index;
+ ASSERT(last_dir_index >= first_dir_index);
+
+ ret = insert_dir_log_key(trans, inode->root->log_root, path,
+ ino, first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ return 0;
+}
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ const struct list_head *delayed_del_list,
+ const struct btrfs_delayed_item *first,
+ const struct btrfs_delayed_item **last_ret)
+{
+ const struct btrfs_delayed_item *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int last_slot = btrfs_header_nritems(leaf) - 1;
+ int slot = path->slots[0] + 1;
+ const u64 ino = btrfs_ino(inode);
+
+ next = list_next_entry(first, log_list);
+
+ while (slot < last_slot &&
+ !list_entry_is_head(next, delayed_del_list, log_list)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+
+ slot++;
+ *last_ret = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ return btrfs_del_items(trans, inode->root->log_root, path,
+ path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ const struct btrfs_delayed_item *curr;
+ u64 last_range_start;
+ u64 last_range_end = 0;
+ struct btrfs_key key;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ const struct btrfs_delayed_item *last = curr;
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ bool deleted_items = false;
+ int ret;
+
+ key.offset = curr->index;
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ delayed_del_list, curr,
+ &last);
+ if (ret)
+ return ret;
+ deleted_items = true;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If we deleted items from the leaf, it means we have a range
+ * item logging their range, so no need to add one or update an
+ * existing one. Otherwise we have to log a dir range item.
+ */
+ if (deleted_items)
+ goto next_batch;
+
+ last_dir_index = last->index;
+ ASSERT(last_dir_index >= first_dir_index);
+ /*
+ * If this range starts right after where the previous one ends,
+ * then we want to reuse the previous range item and change its
+ * end offset to the end of this range. This is just to minimize
+ * leaf space usage, by avoiding adding a new range item.
+ */
+ if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+ first_dir_index = last_range_start;
+
+ ret = insert_dir_log_key(trans, log, path, key.objectid,
+ first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+
+ last_range_start = first_dir_index;
+ last_range_end = last_dir_index;
+next_batch:
+ curr = list_next_entry(last, log_list);
+ }
+
+ return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /*
+ * We are deleting dir index items from the log tree or adding range
+ * items to it.
+ */
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (list_empty(delayed_del_list))
+ return 0;
+
+ if (ctx->logged_before)
+ return log_delayed_deletions_incremental(trans, inode, path,
+ delayed_del_list, ctx);
+
+ return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+ ctx);
+}
+
+/*
+ * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const bool orig_log_new_dentries = ctx->log_new_dentries;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_item *item;
+ int ret = 0;
+
+ /*
+ * No need for the log mutex, plus to avoid potential deadlocks or
+ * lockdep annotations due to nesting of delayed inode mutexes and log
+ * mutexes.
+ */
+ lockdep_assert_not_held(&inode->log_mutex);
+
+ ASSERT(!ctx->logging_new_delayed_dentries);
+ ctx->logging_new_delayed_dentries = true;
+
+ list_for_each_entry(item, delayed_ins_list, log_list) {
+ struct btrfs_dir_item *dir_item;
+ struct inode *di_inode;
+ struct btrfs_key key;
+ int log_mode = LOG_INODE_EXISTS;
+
+ dir_item = (struct btrfs_dir_item *)item->data;
+ btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ break;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ continue;
+ }
+
+ if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+
+ btrfs_add_delayed_iput(di_inode);
+
+ if (ret)
+ break;
+ }
+
+ ctx->log_new_dentries = orig_log_new_dentries;
+ ctx->logging_new_delayed_dentries = false;
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -5764,9 +6314,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
- bool recursive_logging = false;
bool inode_item_dropped = true;
- const bool orig_logged_before = ctx->logged_before;
+ bool full_dir_logging = false;
+ LIST_HEAD(delayed_ins_list);
+ LIST_HEAD(delayed_del_list);
path = btrfs_alloc_path();
if (!path)
@@ -5794,27 +6345,46 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ full_dir_logging = true;
+
/*
- * Only run delayed items if we are a directory. We want to make sure
- * all directory indexes hit the fs/subvolume tree so we can find them
- * and figure out which index ranges have to be logged.
+ * If we are logging a directory while we are logging dentries of the
+ * delayed items of some other inode, then we need to flush the delayed
+ * items of this directory and not log the delayed items directly. This
+ * is to prevent more than one level of recursion into btrfs_log_inode()
+ * by having something like this:
+ *
+ * $ mkdir -p a/b/c/d/e/f/g/h/...
+ * $ xfs_io -c "fsync" a
+ *
+ * Where all directories in the path did not exist before and are
+ * created in the current transaction.
+ * So in such a case we directly log the delayed items of the main
+ * directory ("a") without flushing them first, while for each of its
+ * subdirectories we flush their delayed items before logging them.
+ * This prevents a potential unbounded recursion like this:
+ *
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * (...)
+ *
+ * We have thresholds for the maximum number of delayed items to have in
+ * memory, and once they are hit, the items are flushed asynchronously.
+ * However the limit is quite high, so lets prevent deep levels of
+ * recursion to happen by limiting the maximum depth to be 1.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ if (full_dir_logging && ctx->logging_new_delayed_dentries) {
ret = btrfs_commit_inode_delayed_items(trans, inode);
if (ret)
goto out;
}
- if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
- recursive_logging = true;
- if (inode_only == LOG_OTHER_INODE)
- inode_only = LOG_INODE_EXISTS;
- else
- inode_only = LOG_INODE_ALL;
- mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
- } else {
- mutex_lock(&inode->log_mutex);
- }
+ mutex_lock(&inode->log_mutex);
/*
* For symlinks, we must always log their content, which is stored in an
@@ -5846,9 +6416,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* to known the file was moved from A to B, so logging just A would
* result in losing the file after a log replay.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) &&
- inode_only == LOG_INODE_ALL &&
- inode->last_unlink_trans >= trans->transid) {
+ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
ret = BTRFS_LOG_FORCE_COMMIT;
goto out_unlock;
@@ -5859,14 +6427,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
-
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
if (ctx->logged_before)
ret = drop_inode_items(trans, log, path, inode,
- max_key_type);
+ BTRFS_XATTR_ITEM_KEY);
} else {
if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
@@ -5922,9 +6486,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (ret)
goto out_unlock;
+ /*
+ * If we are logging a directory in full mode, collect the delayed items
+ * before iterating the subvolume tree, so that we don't miss any new
+ * dir index items in case they get flushed while or right after we are
+ * iterating the subvolume tree.
+ */
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+ btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
- recursive_logging, inode_only, ctx,
+ inode_only, ctx,
&need_log_inode_item);
if (ret)
goto out_unlock;
@@ -5977,10 +6551,18 @@ log_extents:
write_unlock(&em_tree->lock);
}
- if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
+ if (full_dir_logging) {
ret = log_directory_changes(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
+ ret = log_delayed_insertion_items(trans, inode, path,
+ &delayed_ins_list, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_deletion_items(trans, inode, path,
+ &delayed_del_list, ctx);
+ if (ret)
+ goto out_unlock;
}
spin_lock(&inode->lock);
@@ -6033,208 +6615,20 @@ out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
- if (recursive_logging)
- ctx->logged_before = orig_logged_before;
-
- return ret;
-}
-
-/*
- * Check if we need to log an inode. This is used in contexts where while
- * logging an inode we need to log another inode (either that it exists or in
- * full mode). This is used instead of btrfs_inode_in_log() because the later
- * requires the inode to be in the log and have the log transaction committed,
- * while here we do not care if the log transaction was already committed - our
- * caller will commit the log later - and we want to avoid logging an inode
- * multiple times when multiple tasks have joined the same log transaction.
- */
-static bool need_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- /*
- * If a directory was not modified, no dentries added or removed, we can
- * and should avoid logging it.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
- return false;
-
- /*
- * If this inode does not have new/updated/deleted xattrs since the last
- * time it was logged and is flagged as logged in the current transaction,
- * we can skip logging it. As for new/deleted names, those are updated in
- * the log by link/unlink/rename operations.
- * In case the inode was logged and then evicted and reloaded, its
- * logged_trans will be 0, in which case we have to fully log it since
- * logged_trans is a transient field, not persisted.
- */
- if (inode->logged_trans == trans->transid &&
- !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
- return false;
-
- return true;
-}
-
-struct btrfs_dir_list {
- u64 ino;
- struct list_head list;
-};
-
-/*
- * Log the inodes of the new dentries of a directory. See log_dir_items() for
- * details about the why it is needed.
- * This is a recursive operation - if an existing dentry corresponds to a
- * directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
- * the dentries point to we do not lock their i_mutex, otherwise lockdep
- * complains about the following circular lock dependency / possible deadlock:
- *
- * CPU0 CPU1
- * ---- ----
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(sb_internal#2);
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(&sb->s_type->i_mutex_key#14);
- *
- * Where sb_internal is the lock (a counter that works as a lock) acquired by
- * sb_start_intwrite() in btrfs_start_transaction().
- * Not locking i_mutex of the inodes is still safe because:
- *
- * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
- * that while logging the inode new references (names) are added or removed
- * from the inode, leaving the logged inode item with a link count that does
- * not match the number of logged inode reference items. This is fine because
- * at log replay time we compute the real number of links and correct the
- * link count in the inode item (see replay_one_buffer() and
- * link_to_fixup_dir());
- *
- * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- * while logging the inode's items new index items (key type
- * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
- * has a size that doesn't match the sum of the lengths of all the logged
- * names - this is ok, not a problem, because at log replay time we set the
- * directory's i_size to the correct value (see replay_one_name() and
- * do_overwrite_item()).
- */
-static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *start_inode,
- struct btrfs_log_ctx *ctx)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- LIST_HEAD(dir_list);
- struct btrfs_dir_list *dir_elem;
- int ret = 0;
-
- /*
- * If we are logging a new name, as part of a link or rename operation,
- * don't bother logging new dentries, as we just want to log the names
- * of an inode and that any new parents exist.
- */
- if (ctx->logging_new_name)
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
- if (!dir_elem) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
- dir_elem->ino = btrfs_ino(start_inode);
- list_add_tail(&dir_elem->list, &dir_list);
-
- while (!list_empty(&dir_list)) {
- struct extent_buffer *leaf;
- struct btrfs_key min_key;
- int nritems;
- int i;
-
- dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
- list);
- if (ret)
- goto next_dir_inode;
-
- min_key.objectid = dir_elem->ino;
- min_key.type = BTRFS_DIR_INDEX_KEY;
- min_key.offset = 0;
-again:
- btrfs_release_path(path);
- ret = btrfs_search_forward(root, &min_key, path, trans->transid);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
- struct btrfs_key di_key;
- struct inode *di_inode;
- struct btrfs_dir_list *new_dir_elem;
- int log_mode = LOG_INODE_EXISTS;
- int type;
-
- btrfs_item_key_to_cpu(leaf, &min_key, i);
- if (min_key.objectid != dir_elem->ino ||
- min_key.type != BTRFS_DIR_INDEX_KEY)
- goto next_dir_inode;
-
- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
- type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid)
- continue;
- btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
- if (di_key.type == BTRFS_ROOT_ITEM_KEY)
- continue;
-
- btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
- if (IS_ERR(di_inode)) {
- ret = PTR_ERR(di_inode);
- goto next_dir_inode;
- }
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(di_inode);
- break;
- }
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+ if (!ret)
+ ret = log_new_delayed_dentries(trans, inode,
+ &delayed_ins_list, ctx);
- ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR)
- log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(di_inode);
- if (ret)
- goto next_dir_inode;
- if (ctx->log_new_dentries) {
- new_dir_elem = kmalloc(sizeof(*new_dir_elem),
- GFP_NOFS);
- if (!new_dir_elem) {
- ret = -ENOMEM;
- goto next_dir_inode;
- }
- new_dir_elem->ino = di_key.objectid;
- list_add_tail(&new_dir_elem->list, &dir_list);
- }
- break;
- }
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- goto again;
- }
-next_dir_inode:
- list_del(&dir_elem->list);
- kfree(dir_elem);
+ btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
}
- btrfs_free_path(path);
return ret;
}
@@ -6346,7 +6740,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, root,
+ ret = log_new_dir_dentries(trans,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
@@ -6661,7 +7055,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, inode, ctx);
+ ret = log_new_dir_dentries(trans, inode, ctx);
else
ret = 0;
end_trans:
@@ -7088,6 +7482,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
* If an error happened mark the log for a full commit because it's not
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 57ab5f3b8dc7..aed1e05e9879 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -20,6 +20,7 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ bool logging_new_delayed_dentries;
/* Indicate if the inode being logged was logged before. */
bool logged_before;
/* Tracks the last logged dir item/index key offset. */
@@ -28,6 +29,9 @@ struct btrfs_log_ctx {
struct list_head list;
/* Only used for fast fsyncs. */
struct list_head ordered_extents;
+ struct list_head conflict_inodes;
+ int num_conflict_inodes;
+ bool logging_conflict_inodes;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -37,10 +41,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_transid = 0;
ctx->log_new_dentries = false;
ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
ctx->logged_before = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
}
static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 90eb5c2830a9..ee00e33c309e 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -659,8 +659,7 @@ rollback:
*
* Returns the size on success or a negative error code on failure.
*/
-static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
- size_t buf_size)
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
{
u64 true_size;
int ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f63ff91e2883..94ba46d57920 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -34,6 +34,8 @@
#include "discard.h"
#include "zoned.h"
+static struct bio_set btrfs_bioset;
+
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -247,10 +249,10 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
+ enum btrfs_map_op op, u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map);
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map);
/*
* Device locking
@@ -2017,7 +2019,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct page *page;
int ret;
- disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
if (IS_ERR(disk_super))
continue;
@@ -5595,7 +5597,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- bg->chunk_item_inserted = 1;
+ set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
@@ -5896,7 +5898,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bioc->error, 0);
refcount_set(&bioc->refs, 1);
bioc->fs_info = fs_info;
@@ -6092,7 +6093,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bioc, 0, 0);
+ logical, &length, &bioc, NULL, NULL, 0);
if (ret) {
ASSERT(bioc == NULL);
return ret;
@@ -6153,9 +6154,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
cache = btrfs_lookup_block_group(fs_info, logical);
- spin_lock(&cache->lock);
- ret = cache->to_copy;
- spin_unlock(&cache->lock);
+ ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
return ret;
@@ -6351,11 +6350,19 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
return 0;
}
+static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
+ u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+{
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
+ enum btrfs_map_op op, u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map)
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6366,6 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int data_stripes;
int i;
int ret = 0;
+ int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
@@ -6526,6 +6534,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
+ /*
+ * If this I/O maps to a single device, try to return the device and
+ * physical block information on the stack instead of allocating an
+ * I/O context structure.
+ */
+ if (smap && num_alloc_stripes == 1 &&
+ !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
+ (!need_full_stripe(op) || !dev_replace_is_ongoing ||
+ !dev_replace->tgtdev)) {
+ if (patch_the_first_stripe_for_dev_replace) {
+ smap->dev = dev_replace->tgtdev;
+ smap->physical = physical_to_patch_in_first_stripe;
+ *mirror_num_ret = map->num_stripes + 1;
+ } else {
+ set_io_stripe(smap, map, stripe_index, stripe_offset,
+ stripe_nr);
+ *mirror_num_ret = mirror_num;
+ }
+ *bioc_ret = NULL;
+ ret = 0;
+ goto out;
+ }
+
bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
if (!bioc) {
ret = -ENOMEM;
@@ -6533,9 +6564,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical = map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+ set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
+ stripe_nr);
stripe_index++;
}
@@ -6603,7 +6633,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
- mirror_num, 0);
+ NULL, &mirror_num, 0);
}
/* For Scrub/replace */
@@ -6611,14 +6641,77 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+ NULL, NULL, 1);
}
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
+/*
+ * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
+ * is already initialized by the block layer.
+ */
+static inline void btrfs_bio_init(struct btrfs_bio *bbio,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->end_io = end_io;
+ bbio->private = private;
+}
+
+/*
+ * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ *
+ * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
+ * a mempool.
+ */
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private)
{
- if (bioc->orig_bio->bi_opf & REQ_META)
- return bioc->fs_info->endio_meta_workers;
- return bioc->fs_info->endio_workers;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ return bio;
+}
+
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+ struct btrfs_bio *bbio;
+
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, end_io, private);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ bbio->iter = bio->bi_iter;
+ return bio;
+}
+
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+ if (!dev || !dev->bdev)
+ return;
+ if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+ return;
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
+}
+
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ if (bio->bi_opf & REQ_META)
+ return fs_info->endio_meta_workers;
+ return fs_info->endio_workers;
}
static void btrfs_end_bio_work(struct work_struct *work)
@@ -6626,103 +6719,101 @@ static void btrfs_end_bio_work(struct work_struct *work)
struct btrfs_bio *bbio =
container_of(work, struct btrfs_bio, end_io_work);
- bio_endio(&bbio->bio);
+ bbio->end_io(bbio);
}
-static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
+static void btrfs_simple_end_io(struct bio *bio)
{
- struct bio *orig_bio = bioc->orig_bio;
- struct btrfs_bio *bbio = btrfs_bio(orig_bio);
+ struct btrfs_fs_info *fs_info = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
- bbio->mirror_num = bioc->mirror_num;
- orig_bio->bi_private = bioc->private;
- orig_bio->bi_end_io = bioc->end_io;
+ btrfs_bio_counter_dec(fs_info);
- /*
- * Only send an error to the higher layers if it is beyond the tolerance
- * threshold.
- */
- if (atomic_read(&bioc->error) > bioc->max_errors)
- orig_bio->bi_status = BLK_STS_IOERR;
- else
- orig_bio->bi_status = BLK_STS_OK;
+ if (bio->bi_status)
+ btrfs_log_dev_io_error(bio, bbio->device);
- if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
+ if (bio_op(bio) == REQ_OP_READ) {
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
- queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
- bio_endio(orig_bio);
+ bbio->end_io(bbio);
}
+}
+
+static void btrfs_raid56_end_io(struct bio *bio)
+{
+ struct btrfs_io_context *bioc = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+ bbio->mirror_num = bioc->mirror_num;
+ bbio->end_io(bbio);
btrfs_put_bioc(bioc);
}
-static void btrfs_end_bio(struct bio *bio)
+static void btrfs_orig_write_end_io(struct bio *bio)
{
struct btrfs_io_stripe *stripe = bio->bi_private;
struct btrfs_io_context *bioc = stripe->bioc;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
if (bio->bi_status) {
atomic_inc(&bioc->error);
- if (bio->bi_status == BLK_STS_IOERR ||
- bio->bi_status == BLK_STS_TARGET) {
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(stripe->dev,
- BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(stripe->dev,
- BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(stripe->dev,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- }
+ btrfs_log_dev_io_error(bio, stripe->dev);
}
- if (bio != bioc->orig_bio)
- bio_put(bio);
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
- btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bioc->stripes_pending))
- btrfs_end_bioc(bioc, true);
+ bbio->end_io(bbio);
+ btrfs_put_bioc(bioc);
}
-static void submit_stripe_bio(struct btrfs_io_context *bioc,
- struct bio *orig_bio, int dev_nr, bool clone)
+static void btrfs_clone_write_end_io(struct bio *bio)
{
- struct btrfs_fs_info *fs_info = bioc->fs_info;
- struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
- u64 physical = bioc->stripes[dev_nr].physical;
- struct bio *bio;
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ if (bio->bi_status) {
+ atomic_inc(&stripe->bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /* Pass on control to the original bio this one was cloned from */
+ bio_endio(stripe->bioc->orig_bio);
+ bio_put(bio);
+}
+
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
+{
if (!dev || !dev->bdev ||
test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
- (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
+ (btrfs_op(bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending))
- btrfs_end_bioc(bioc, false);
+ bio_io_error(bio);
return;
}
- if (clone) {
- bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set);
- } else {
- bio = orig_bio;
- bio_set_dev(bio, dev->bdev);
- btrfs_bio(bio)->device = dev;
- }
+ bio_set_dev(bio, dev->bdev);
- bioc->stripes[dev_nr].bioc = bioc;
- bio->bi_private = &bioc->stripes[dev_nr];
- bio->bi_end_io = btrfs_end_bio;
- bio->bi_iter.bi_sector = physical >> 9;
/*
* For zone append writing, bi_sector must point the beginning of the
* zone
*/
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
if (btrfs_dev_is_sequential(dev, physical)) {
- u64 zone_start = round_down(physical, fs_info->zone_size);
+ u64 zone_start = round_down(physical,
+ dev->fs_info->zone_size);
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
} else {
@@ -6730,50 +6821,53 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc,
bio->bi_opf |= REQ_OP_WRITE;
}
}
- btrfs_debug_in_rcu(fs_info,
+ btrfs_debug_in_rcu(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
- btrfs_bio_counter_inc_noblocked(fs_info);
-
btrfsic_check_bio(bio);
submit_bio(bio);
}
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
+{
+ struct bio *orig_bio = bioc->orig_bio, *bio;
+
+ ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+ /* Reuse the bio embedded into the btrfs_bio for the last mirror */
+ if (dev_nr == bioc->num_stripes - 1) {
+ bio = orig_bio;
+ bio->bi_end_io = btrfs_orig_write_end_io;
+ } else {
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ bio_inc_remaining(orig_bio);
+ bio->bi_end_io = btrfs_clone_write_end_io;
+ }
+
+ bio->bi_private = &bioc->stripes[dev_nr];
+ bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+ bioc->stripes[dev_nr].bioc = bioc;
+ btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
+}
+
void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = bio->bi_iter.bi_size;
u64 map_length = length;
- int ret;
- int dev_nr;
- int total_devs;
struct btrfs_io_context *bioc = NULL;
+ struct btrfs_io_stripe smap;
+ int ret;
btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bioc, mirror_num, 1);
+ ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- return;
- }
-
- total_devs = bioc->num_stripes;
- bioc->orig_bio = bio;
- bioc->private = bio->bi_private;
- bioc->end_io = bio->bi_end_io;
- atomic_set(&bioc->stripes_pending, total_devs);
-
- if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- raid56_parity_write(bio, bioc);
- else
- raid56_parity_recover(bio, bioc, mirror_num, true);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
return;
}
@@ -6784,12 +6878,31 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror
BUG();
}
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- const bool should_clone = (dev_nr < total_devs - 1);
+ if (!bioc) {
+ /* Single mirror read/write fast path */
+ btrfs_bio(bio)->mirror_num = mirror_num;
+ btrfs_bio(bio)->device = smap.dev;
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ bio->bi_private = fs_info;
+ bio->bi_end_io = btrfs_simple_end_io;
+ btrfs_submit_dev_bio(smap.dev, bio);
+ } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* Parity RAID write or read recovery */
+ bio->bi_private = bioc;
+ bio->bi_end_io = btrfs_raid56_end_io;
+ if (bio_op(bio) == REQ_OP_READ)
+ raid56_parity_recover(bio, bioc, mirror_num);
+ else
+ raid56_parity_write(bio, bioc);
+ } else {
+ /* Write to multiple mirrors */
+ int total_devs = bioc->num_stripes;
+ int dev_nr;
- submit_stripe_bio(bioc, bio, dev_nr, should_clone);
+ bioc->orig_bio = bio;
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ btrfs_submit_mirrored_bio(bioc, dev_nr);
}
- btrfs_bio_counter_dec(fs_info);
}
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
@@ -8244,7 +8357,7 @@ static int relocating_repair_kthread(void *data)
if (!cache)
goto out;
- if (!cache->relocating_repair)
+ if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
goto out;
ret = btrfs_may_alloc_data_chunk(fs_info, target);
@@ -8281,17 +8394,27 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
if (!cache)
return true;
- spin_lock(&cache->lock);
- if (cache->relocating_repair) {
- spin_unlock(&cache->lock);
+ if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
btrfs_put_block_group(cache);
return true;
}
- cache->relocating_repair = 1;
- spin_unlock(&cache->lock);
kthread_run(relocating_repair_kthread, cache,
"btrfs-relocating-repair");
return true;
}
+
+int __init btrfs_bioset_init(void)
+{
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+ bioset_exit(&btrfs_bioset);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5639961b3626..599b9d5af349 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -181,6 +181,31 @@ struct btrfs_device {
};
/*
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
+ */
+struct btrfs_swapfile_pin {
+ struct rb_node node;
+ void *ptr;
+ struct inode *inode;
+ /*
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
+ */
+ bool is_block_group;
+ /*
+ * Only used when 'is_block_group' is true and it is the number of
+ * extents used by a swapfile for this block group ('ptr' field).
+ */
+ int bg_extent_count;
+};
+
+/*
* If we read those variants at the context of their own lock, we needn't
* use the following helpers, reading them directly is safe.
*/
@@ -361,6 +386,8 @@ struct btrfs_fs_devices {
*/
#define BTRFS_MAX_BIO_SECTORS (256)
+typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
+
/*
* Additional info to pass along bio.
*
@@ -378,6 +405,10 @@ struct btrfs_bio {
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+ /* End I/O information supplied to btrfs_bio_alloc */
+ btrfs_bio_end_io_t end_io;
+ void *private;
+
/* For read end I/O handling */
struct work_struct end_io_work;
@@ -393,6 +424,20 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
return container_of(bio, struct btrfs_bio, bio);
}
+int __init btrfs_bioset_init(void);
+void __cold btrfs_bioset_exit(void);
+
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private);
+
+static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+ bbio->bio.bi_status = status;
+ bbio->end_io(bbio);
+}
+
static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
if (bbio->csum != bbio->csum_inline) {
@@ -451,12 +496,9 @@ struct btrfs_discard_stripe {
*/
struct btrfs_io_context {
refcount_t refs;
- atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
- bio_end_io_t *end_io;
struct bio *orig_bio;
- void *private;
atomic_t error;
int max_errors;
int num_stripes;
@@ -714,4 +756,6 @@ const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+
#endif
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 73c6929f7be6..e2d073b08a7d 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -652,80 +652,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
return 0;
}
+static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ if (device->bdev &&
+ bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ btrfs_err(fs_info,
+ "zoned: mode not enabled but zoned device found: %pg",
+ device->bdev);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- u64 zoned_devices = 0;
- u64 nr_devices = 0;
u64 zone_size = 0;
u64 max_zone_append_size = 0;
- const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
- int ret = 0;
+ int ret;
- /* Count zoned devices */
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- enum blk_zoned_model model;
+ /*
+ * Host-Managed devices can't be used without the ZONED flag. With the
+ * ZONED all devices can be used, using zone emulation if required.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return btrfs_check_for_zoned_device(fs_info);
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
if (!device->bdev)
continue;
- model = bdev_zoned_model(device->bdev);
- /*
- * A Host-Managed zoned device must be used as a zoned device.
- * A Host-Aware zoned device and a non-zoned devices can be
- * treated as a zoned device, if ZONED flag is enabled in the
- * superblock.
- */
- if (model == BLK_ZONED_HM ||
- (model == BLK_ZONED_HA && incompat_zoned) ||
- (model == BLK_ZONED_NONE && incompat_zoned)) {
- struct btrfs_zoned_device_info *zone_info;
-
- zone_info = device->zone_info;
- zoned_devices++;
- if (!zone_size) {
- zone_size = zone_info->zone_size;
- } else if (zone_info->zone_size != zone_size) {
- btrfs_err(fs_info,
+ if (!zone_size) {
+ zone_size = zone_info->zone_size;
+ } else if (zone_info->zone_size != zone_size) {
+ btrfs_err(fs_info,
"zoned: unequal block device zone sizes: have %llu found %llu",
- device->zone_info->zone_size,
- zone_size);
- ret = -EINVAL;
- goto out;
- }
- if (!max_zone_append_size ||
- (zone_info->max_zone_append_size &&
- zone_info->max_zone_append_size < max_zone_append_size))
- max_zone_append_size =
- zone_info->max_zone_append_size;
+ zone_info->zone_size, zone_size);
+ return -EINVAL;
}
- nr_devices++;
- }
-
- if (!zoned_devices && !incompat_zoned)
- goto out;
-
- if (!zoned_devices && incompat_zoned) {
- /* No zoned block device found on ZONED filesystem */
- btrfs_err(fs_info,
- "zoned: no zoned devices found on a zoned filesystem");
- ret = -EINVAL;
- goto out;
- }
-
- if (zoned_devices && !incompat_zoned) {
- btrfs_err(fs_info,
- "zoned: mode not enabled but zoned device found");
- ret = -EINVAL;
- goto out;
- }
-
- if (zoned_devices != nr_devices) {
- btrfs_err(fs_info,
- "zoned: cannot mix zoned and regular devices");
- ret = -EINVAL;
- goto out;
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size = zone_info->max_zone_append_size;
}
/*
@@ -737,14 +712,12 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info,
"zoned: zone size %llu not aligned to stripe %u",
zone_size, BTRFS_STRIPE_LEN);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
btrfs_err(fs_info, "zoned: mixed block groups not supported");
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
fs_info->zone_size = zone_size;
@@ -760,11 +733,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
*/
ret = btrfs_check_mountopts_zoned(fs_info);
if (ret)
- goto out;
+ return ret;
btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
-out:
- return ret;
+ return 0;
}
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
@@ -1436,7 +1408,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
- cache->zone_is_active = 1;
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
}
@@ -1452,7 +1424,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
cache->alloc_offset = alloc_offsets[0];
cache->zone_capacity = caps[0];
- cache->zone_is_active = test_bit(0, active);
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
break;
case BTRFS_BLOCK_GROUP_DUP:
if (map->type & BTRFS_BLOCK_GROUP_DATA) {
@@ -1486,7 +1459,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
} else {
- cache->zone_is_active = test_bit(0, active);
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
}
cache->alloc_offset = alloc_offsets[0];
cache->zone_capacity = min(caps[0], caps[1]);
@@ -1530,7 +1505,7 @@ out:
if (!ret) {
cache->meta_write_pointer = cache->alloc_offset + cache->start;
- if (cache->zone_is_active) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
btrfs_get_block_group(cache);
spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&cache->active_bg_list,
@@ -1563,7 +1538,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
@@ -1871,7 +1845,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->zone_is_active) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
goto out_unlock;
}
@@ -1897,7 +1871,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
}
/* Successfully activated all the zones */
- block_group->zone_is_active = 1;
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
space_info->active_total_bytes += block_group->length;
spin_unlock(&block_group->lock);
btrfs_try_granting_tickets(fs_info, space_info);
@@ -1960,7 +1934,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
int i;
spin_lock(&block_group->lock);
- if (!block_group->zone_is_active) {
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -2001,7 +1975,8 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
* Bail out if someone already deactivated the block group, or
* allocated space is left in the block group.
*/
- if (!block_group->zone_is_active) {
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return 0;
@@ -2014,7 +1989,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
}
}
- block_group->zone_is_active = 0;
+ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
btrfs_clear_treelog_bg(block_group);
@@ -2222,13 +2197,14 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
spin_lock(&block_group->lock);
- if (!block_group->zoned_data_reloc_ongoing)
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
goto out;
/* All relocation extents are written. */
if (block_group->start + block_group->alloc_offset == logical + length) {
/* Now, release this block group for further allocations. */
- block_group->zoned_data_reloc_ongoing = 0;
+ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags);
}
out:
@@ -2300,7 +2276,9 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
list) {
if (!spin_trylock(&bg->lock))
continue;
- if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) {
+ if (btrfs_zoned_bg_is_full(bg) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &bg->runtime_flags)) {
spin_unlock(&bg->lock);
continue;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 55e762a58eb6..d9c6d1fbb6dd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,8 +52,8 @@
#include "internal.h"
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
- struct writeback_control *wbc);
+static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -152,7 +152,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
/*
* Default synchronous end-of-IO handler.. Just mark it up-to-date and
- * unlock the buffer. This is what ll_rw_block uses too.
+ * unlock the buffer.
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
@@ -491,8 +491,8 @@ int inode_has_buffers(struct inode *inode)
* all already-submitted IO to complete, but does not queue any new
* writes to the disk.
*
- * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
- * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
+ * as you dirty the buffers, and then use osync_inode_buffers to wait for
* completion. Any other dirty buffers which are not yet queued for
* write will not be flushed to disk by the osync.
*/
@@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev,
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
- ll_rw_block(REQ_OP_WRITE, 1, &bh);
+ write_dirty_buffer(bh, 0);
put_bh(bh);
}
}
@@ -1342,23 +1342,12 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
+ bh_readahead(bh, REQ_RAHEAD);
brelse(bh);
}
}
EXPORT_SYMBOL(__breadahead);
-void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
- gfp_t gfp)
-{
- struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
- if (likely(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
- brelse(bh);
- }
-}
-EXPORT_SYMBOL(__breadahead_gfp);
-
/**
* __bread_gfp() - reads a specified block and returns the bh
* @bdev: the block_device to read from
@@ -1464,19 +1453,15 @@ EXPORT_SYMBOL(set_bh_page);
static void discard_buffer(struct buffer_head * bh)
{
- unsigned long b_state, b_state_old;
+ unsigned long b_state;
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
- b_state = bh->b_state;
- for (;;) {
- b_state_old = cmpxchg(&bh->b_state, b_state,
- (b_state & ~BUFFER_FLAGS_DISCARD));
- if (b_state_old == b_state)
- break;
- b_state = b_state_old;
- }
+ b_state = READ_ONCE(bh->b_state);
+ do {
+ } while (!try_cmpxchg(&bh->b_state, &b_state,
+ b_state & ~BUFFER_FLAGS_DISCARD));
unlock_buffer(bh);
}
@@ -1817,7 +1802,7 @@ done:
/*
* The page was marked dirty, but the buffers were
* clean. Someone wrote them back by hand with
- * ll_rw_block/submit_bh. A rare case.
+ * write_dirty_buffer/submit_bh. A rare case.
*/
end_page_writeback(page);
@@ -2033,7 +2018,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
*wait_bh++=bh;
}
}
@@ -2352,7 +2337,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
err = inode_newsize_ok(inode, size);
@@ -2378,7 +2363,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
const struct address_space_operations *aops = mapping->a_ops;
unsigned int blocksize = i_blocksize(inode);
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
pgoff_t index, curidx;
loff_t curpos;
unsigned zerofrom, offset, len;
@@ -2593,11 +2578,9 @@ int block_truncate_page(struct address_space *mapping,
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
- err = -EIO;
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
+ err = bh_read(bh, 0);
/* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ if (err < 0)
goto unlock;
}
@@ -2673,8 +2656,8 @@ static void end_bio_bh_io_sync(struct bio *bio)
bio_put(bio);
}
-static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
- struct writeback_control *wbc)
+static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ struct writeback_control *wbc)
{
const enum req_op op = opf & REQ_OP_MASK;
struct bio *bio;
@@ -2717,70 +2700,14 @@ static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
}
submit_bio(bio);
- return 0;
}
-int submit_bh(blk_opf_t opf, struct buffer_head *bh)
+void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
- return submit_bh_wbc(opf, bh, NULL);
+ submit_bh_wbc(opf, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
-/**
- * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @opf: block layer request operation and flags.
- * @nr: number of &struct buffer_heads in the array
- * @bhs: array of pointers to &struct buffer_head
- *
- * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
- * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
- * @opf contains flags modifying the detailed I/O behavior, most notably
- * %REQ_RAHEAD.
- *
- * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit), any buffer that appears to be clean when doing a write
- * request, and any buffer that appears to be up-to-date when doing read
- * request. Further it marks as clean buffers that are processed for
- * writing (the buffer cache won't assume that they are actually clean
- * until the buffer gets unlocked).
- *
- * ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
- * any waiters.
- *
- * All of the buffers must be for the same device, and must also be a
- * multiple of the current approved size for the device.
- */
-void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[])
-{
- const enum req_op op = opf & REQ_OP_MASK;
- int i;
-
- for (i = 0; i < nr; i++) {
- struct buffer_head *bh = bhs[i];
-
- if (!trylock_buffer(bh))
- continue;
- if (op == REQ_OP_WRITE) {
- if (test_clear_buffer_dirty(bh)) {
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
- submit_bh(opf, bh);
- continue;
- }
- } else {
- if (!buffer_uptodate(bh)) {
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(opf, bh);
- continue;
- }
- }
- unlock_buffer(bh);
- }
-}
-EXPORT_SYMBOL(ll_rw_block);
-
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
lock_buffer(bh);
@@ -2801,8 +2728,6 @@ EXPORT_SYMBOL(write_dirty_buffer);
*/
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
- int ret = 0;
-
WARN_ON(atomic_read(&bh->b_count) < 1);
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
@@ -2817,14 +2742,14 @@ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(REQ_OP_WRITE | op_flags, bh);
+ submit_bh(REQ_OP_WRITE | op_flags, bh);
wait_on_buffer(bh);
- if (!ret && !buffer_uptodate(bh))
- ret = -EIO;
+ if (!buffer_uptodate(bh))
+ return -EIO;
} else {
unlock_buffer(bh);
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__sync_dirty_buffer);
@@ -3029,29 +2954,69 @@ int bh_uptodate_or_lock(struct buffer_head *bh)
EXPORT_SYMBOL(bh_uptodate_or_lock);
/**
- * bh_submit_read - Submit a locked buffer for reading
+ * __bh_read - Submit read for a locked buffer
* @bh: struct buffer_head
+ * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
+ * @wait: wait until reading finish
*
- * Returns zero on success and -EIO on error.
+ * Returns zero on success or don't wait, and -EIO on error.
*/
-int bh_submit_read(struct buffer_head *bh)
+int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
- BUG_ON(!buffer_locked(bh));
+ int ret = 0;
- if (buffer_uptodate(bh)) {
- unlock_buffer(bh);
- return 0;
- }
+ BUG_ON(!buffer_locked(bh));
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
+ submit_bh(REQ_OP_READ | op_flags, bh);
+ if (wait) {
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ ret = -EIO;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__bh_read);
+
+/**
+ * __bh_read_batch - Submit read for a batch of unlocked buffers
+ * @nr: entry number of the buffer batch
+ * @bhs: a batch of struct buffer_head
+ * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
+ * @force_lock: force to get a lock on the buffer if set, otherwise drops any
+ * buffer that cannot lock.
+ *
+ * Returns zero on success or don't wait, and -EIO on error.
+ */
+void __bh_read_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags, bool force_lock)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ struct buffer_head *bh = bhs[i];
+
+ if (buffer_uptodate(bh))
+ continue;
+
+ if (force_lock)
+ lock_buffer(bh);
+ else
+ if (!trylock_buffer(bh))
+ continue;
+
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ | op_flags, bh);
+ }
}
-EXPORT_SYMBOL(bh_submit_read);
+EXPORT_SYMBOL(__bh_read_batch);
void __init buffer_init(void)
{
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index facf2ebe464b..03ca8f2f657a 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -15,9 +15,8 @@
* file or directory. The caller must hold the inode lock.
*/
static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
bool can_use = false;
if (!(inode->i_flags & S_KERNEL_FILE)) {
@@ -26,21 +25,18 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object,
can_use = true;
} else {
trace_cachefiles_mark_failed(object, inode);
- pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
- dentry, inode->i_ino);
}
return can_use;
}
static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
bool can_use;
inode_lock(inode);
- can_use = __cachefiles_mark_inode_in_use(object, dentry);
+ can_use = __cachefiles_mark_inode_in_use(object, inode);
inode_unlock(inode);
return can_use;
}
@@ -49,21 +45,17 @@ static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object,
* Unmark a backing inode. The caller must hold the inode lock.
*/
static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
-
inode->i_flags &= ~S_KERNEL_FILE;
trace_cachefiles_mark_inactive(object, inode);
}
static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
-
inode_lock(inode);
- __cachefiles_unmark_inode_in_use(object, dentry);
+ __cachefiles_unmark_inode_in_use(object, inode);
inode_unlock(inode);
}
@@ -77,14 +69,12 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
struct cachefiles_cache *cache = object->volume->cache;
struct inode *inode = file_inode(file);
- if (inode) {
- cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry);
+ cachefiles_do_unmark_inode_in_use(object, inode);
- if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
- atomic_long_add(inode->i_blocks, &cache->b_released);
- if (atomic_inc_return(&cache->f_released))
- cachefiles_state_changed(cache);
- }
+ if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
+ atomic_long_add(inode->i_blocks, &cache->b_released);
+ if (atomic_inc_return(&cache->f_released))
+ cachefiles_state_changed(cache);
}
}
@@ -164,8 +154,11 @@ retry:
inode_lock(d_inode(subdir));
inode_unlock(d_inode(dir));
- if (!__cachefiles_mark_inode_in_use(NULL, subdir))
+ if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) {
+ pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
+ subdir, d_inode(subdir)->i_ino);
goto mark_error;
+ }
inode_unlock(d_inode(subdir));
@@ -224,9 +217,7 @@ nomem_d_alloc:
void cachefiles_put_directory(struct dentry *dir)
{
if (dir) {
- inode_lock(dir->d_inode);
- __cachefiles_unmark_inode_in_use(NULL, dir);
- inode_unlock(dir->d_inode);
+ cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir));
dput(dir);
}
}
@@ -410,7 +401,7 @@ try_again:
"Rename failed with error %d", ret);
}
- __cachefiles_unmark_inode_in_use(object, rep);
+ __cachefiles_unmark_inode_in_use(object, d_inode(rep));
unlock_rename(cache->graveyard, dir);
dput(grave);
_leave(" = 0");
@@ -451,84 +442,72 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
const struct cred *saved_cred;
struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash];
struct file *file;
- struct path path;
+ const struct path parentpath = { .mnt = cache->mnt, .dentry = fan };
uint64_t ni_size;
long ret;
cachefiles_begin_secure(cache, &saved_cred);
- path.mnt = cache->mnt;
ret = cachefiles_inject_write_error();
- if (ret == 0)
- path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR);
- else
- path.dentry = ERR_PTR(ret);
- if (IS_ERR(path.dentry)) {
- trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry),
+ if (ret == 0) {
+ file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG,
+ O_RDWR | O_LARGEFILE | O_DIRECT,
+ cache->cache_cred);
+ ret = PTR_ERR_OR_ZERO(file);
+ }
+ if (ret) {
+ trace_cachefiles_vfs_error(object, d_inode(fan), ret,
cachefiles_trace_tmpfile_error);
- if (PTR_ERR(path.dentry) == -EIO)
+ if (ret == -EIO)
cachefiles_io_error_obj(object, "Failed to create tmpfile");
- file = ERR_CAST(path.dentry);
- goto out;
+ goto err;
}
- trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry));
+ trace_cachefiles_tmpfile(object, file_inode(file));
- if (!cachefiles_mark_inode_in_use(object, path.dentry)) {
- file = ERR_PTR(-EBUSY);
- goto out_dput;
- }
+ /* This is a newly created file with no other possible user */
+ if (!cachefiles_mark_inode_in_use(object, file_inode(file)))
+ WARN_ON(1);
ret = cachefiles_ondemand_init_object(object);
- if (ret < 0) {
- file = ERR_PTR(ret);
- goto out_unuse;
- }
+ if (ret < 0)
+ goto err_unuse;
ni_size = object->cookie->object_size;
ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
if (ni_size > 0) {
- trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size,
+ trace_cachefiles_trunc(object, file_inode(file), 0, ni_size,
cachefiles_trunc_expand_tmpfile);
ret = cachefiles_inject_write_error();
if (ret == 0)
- ret = vfs_truncate(&path, ni_size);
+ ret = vfs_truncate(&file->f_path, ni_size);
if (ret < 0) {
trace_cachefiles_vfs_error(
- object, d_backing_inode(path.dentry), ret,
+ object, file_inode(file), ret,
cachefiles_trace_trunc_error);
- file = ERR_PTR(ret);
- goto out_unuse;
+ goto err_unuse;
}
}
- file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
- d_backing_inode(path.dentry), cache->cache_cred);
- if (IS_ERR(file)) {
- trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry),
- PTR_ERR(file),
- cachefiles_trace_open_error);
- goto out_unuse;
- }
+ ret = -EINVAL;
if (unlikely(!file->f_op->read_iter) ||
unlikely(!file->f_op->write_iter)) {
fput(file);
pr_notice("Cache does not support read_iter and write_iter\n");
- file = ERR_PTR(-EINVAL);
- goto out_unuse;
+ goto err_unuse;
}
-
- goto out_dput;
-
-out_unuse:
- cachefiles_do_unmark_inode_in_use(object, path.dentry);
-out_dput:
- dput(path.dentry);
out:
cachefiles_end_secure(cache, saved_cred);
return file;
+
+err_unuse:
+ cachefiles_do_unmark_inode_in_use(object, file_inode(file));
+ fput(file);
+err:
+ file = ERR_PTR(ret);
+ goto out;
}
/*
@@ -569,8 +548,11 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
_enter("%pd", dentry);
- if (!cachefiles_mark_inode_in_use(object, dentry))
+ if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) {
+ pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
+ dentry, d_inode(dentry)->i_ino);
return false;
+ }
/* We need to open a file interface onto a data file now as we can't do
* it on demand because writeback called from do_exit() sees
@@ -624,7 +606,7 @@ check_failed:
error_fput:
fput(file);
error:
- cachefiles_do_unmark_inode_in_use(object, dentry);
+ cachefiles_do_unmark_inode_in_use(object, d_inode(dentry));
dput(dentry);
return false;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 53cfe026b3ea..fb023f9fafcb 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
+ wake_up_all(&ci->i_cap_wq);
}
/*
@@ -2285,7 +2286,7 @@ retry:
struct ceph_mds_request *req;
int i;
- sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL);
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
if (!sessions) {
err = -ENOMEM;
goto out;
@@ -2759,13 +2760,17 @@ again:
* on transition from wanted -> needed caps. This is needed
* for WRBUFFER|WR -> WR to avoid a new WR sync write from
* going before a prior buffered writeback happens.
+ *
+ * For RDCACHE|RD -> RD, there is not need to wait and we can
+ * just exclude the revoking caps and force to sync read.
*/
int not = want & ~(have & need);
int revoking = implemented & ~have;
+ int exclude = revoking & not;
dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
- if ((revoking & not) == 0) {
+ if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
if (!snap_rwsem_locked &&
!ci->i_head_snapc &&
(need & CEPH_CAP_FILE_WR)) {
@@ -2787,7 +2792,7 @@ again:
snap_rwsem_locked = true;
}
if ((have & want) == want)
- *got = need | want;
+ *got = need | (want & ~exclude);
else
*got = need;
ceph_take_cap_refs(ci, *got, true);
@@ -3550,6 +3555,9 @@ static void handle_cap_grant(struct inode *inode,
check_caps = 1; /* check auth cap only */
else
check_caps = 2; /* check all caps */
+ /* If there is new caps, try to wake up the waiters */
+ if (~cap->issued & newcaps)
+ wake = true;
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e0fa66ac8b9f..f780e4e0d062 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct inode *inode = __lookup_inode(sb, ino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
int err;
if (IS_ERR(inode))
@@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
return ERR_PTR(err);
}
/* -ESTALE if inode as been unlinked and no file is open */
- if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
+ if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) {
iput(inode);
return ERR_PTR(-ESTALE);
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 42351d7a0dd6..4af5e55abc15 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
- i = prandom_u32() % nsplits;
+ i = prandom_u32_max(nsplits);
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
update = true;
@@ -2192,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = attr->ia_ctime;
+ inode_inc_iversion_raw(inode);
}
release &= issued;
@@ -2356,6 +2357,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
goto out;
}
+ req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
req->r_path2 = kstrdup(name, GFP_NOFS);
if (!req->r_path2) {
err = -ENOMEM;
@@ -2447,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct super_block *sb = inode->i_sb;
struct ceph_inode_info *ci = ceph_inode(inode);
u32 valid_mask = STATX_BASIC_STATS;
int err = 0;
@@ -2476,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
}
if (ceph_snap(inode) == CEPH_NOSNAP)
- stat->dev = inode->i_sb->s_dev;
+ stat->dev = sb->s_dev;
else
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
- RBYTES))
+ if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
- else
+ } else if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ struct ceph_inode_info *pci;
+ struct ceph_snap_realm *realm;
+ struct inode *parent;
+
+ parent = ceph_lookup_inode(sb, ceph_ino(inode));
+ if (!parent)
+ return PTR_ERR(parent);
+
+ pci = ceph_inode(parent);
+ spin_lock(&pci->i_ceph_lock);
+ realm = pci->i_snap_realm;
+ if (realm)
+ stat->size = realm->num_snaps;
+ else
+ stat->size = 0;
+ spin_unlock(&pci->i_ceph_lock);
+ iput(parent);
+ } else {
stat->size = ci->i_files + ci->i_subdirs;
+ }
stat->blocks = 0;
stat->blksize = 65536;
/*
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 80f8b9ec1a31..26a0a8b9975e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2318,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
+ req->r_feature_needed = -1;
kref_init(&req->r_kref);
RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait);
@@ -2916,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc,
dout("do_request mds%d session %p state %s\n", mds, session,
ceph_session_state_name(session->s_state));
+
+ /*
+ * The old ceph will crash the MDSs when see unknown OPs
+ */
+ if (req->r_feature_needed > 0 &&
+ !test_bit(req->r_feature_needed, &session->s_features)) {
+ err = -EOPNOTSUPP;
+ goto out_session;
+ }
+
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
session->s_state != CEPH_MDS_SESSION_HUNG) {
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 256e3eada6c1..0598faa50e2e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -31,8 +31,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_METRIC_COLLECT,
CEPHFS_FEATURE_ALTERNATE_NAME,
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
+ CEPHFS_FEATURE_OP_GETVXATTR,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -44,6 +45,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_DELEG_INO, \
CEPHFS_FEATURE_METRIC_COLLECT, \
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
+ CEPHFS_FEATURE_OP_GETVXATTR, \
}
/*
@@ -336,6 +338,8 @@ struct ceph_mds_request {
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
+ int r_feature_needed;
+
struct ceph_cap_reservation r_caps_reservation;
};
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 8d0a6d2c2da4..3fbabc98e1f7 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
return -1;
/* pick */
- n = prandom_u32() % n;
+ n = prandom_u32_max(n);
for (j = 0, i = 0; i < m->possible_max_rank; i++) {
if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c
index b401339f6e73..60399081046a 100644
--- a/fs/cifs/cached_dir.c
+++ b/fs/cifs/cached_dir.c
@@ -5,12 +5,99 @@
* Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com>
*/
+#include <linux/namei.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
#include "smb2proto.h"
#include "cached_dir.h"
+static struct cached_fid *init_cached_dir(const char *path);
+static void free_cached_dir(struct cached_fid *cfid);
+
+static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
+ const char *path,
+ bool lookup_only)
+{
+ struct cached_fid *cfid;
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ if (!strcmp(cfid->path, path)) {
+ /*
+ * If it doesn't have a lease it is either not yet
+ * fully cached or it may be in the process of
+ * being deleted due to a lease break.
+ */
+ if (!cfid->has_lease) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ kref_get(&cfid->refcount);
+ spin_unlock(&cfids->cfid_list_lock);
+ return cfid;
+ }
+ }
+ if (lookup_only) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ if (cfids->num_entries >= MAX_CACHED_FIDS) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ cfid = init_cached_dir(path);
+ if (cfid == NULL) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ cfid->cfids = cfids;
+ cfids->num_entries++;
+ list_add(&cfid->entry, &cfids->entries);
+ cfid->on_list = true;
+ kref_get(&cfid->refcount);
+ spin_unlock(&cfids->cfid_list_lock);
+ return cfid;
+}
+
+static struct dentry *
+path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path)
+{
+ struct dentry *dentry;
+ const char *s, *p;
+ char sep;
+
+ sep = CIFS_DIR_SEP(cifs_sb);
+ dentry = dget(cifs_sb->root);
+ s = path;
+
+ do {
+ struct inode *dir = d_inode(dentry);
+ struct dentry *child;
+
+ if (!S_ISDIR(dir->i_mode)) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOTDIR);
+ break;
+ }
+
+ /* skip separators */
+ while (*s == sep)
+ s++;
+ if (!*s)
+ break;
+ p = s++;
+ /* next separator */
+ while (*s && *s != sep)
+ s++;
+
+ child = lookup_positive_unlocked(p, dentry, s - p);
+ dput(dentry);
+ dentry = child;
+ } while (!IS_ERR(dentry));
+ return dentry;
+}
+
/*
* Open the and cache a directory handle.
* If error then *cfid is not initialized.
@@ -31,54 +118,57 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
struct kvec qi_iov[1];
int rc, flags = 0;
- __le16 utf16_path = 0; /* Null - since an open of top of share */
+ __le16 *utf16_path = NULL;
u8 oplock = SMB2_OPLOCK_LEVEL_II;
struct cifs_fid *pfid;
- struct dentry *dentry;
+ struct dentry *dentry = NULL;
struct cached_fid *cfid;
+ struct cached_fids *cfids;
- if (tcon == NULL || tcon->nohandlecache ||
+ if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache ||
is_smb1_server(tcon->ses->server))
return -EOPNOTSUPP;
ses = tcon->ses;
server = ses->server;
+ cfids = tcon->cfids;
- if (cifs_sb->root == NULL)
- return -ENOENT;
+ if (!server->ops->new_lease_key)
+ return -EIO;
- if (strlen(path))
+ if (cifs_sb->root == NULL)
return -ENOENT;
- dentry = cifs_sb->root;
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
- cfid = tcon->cfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->is_valid) {
- cifs_dbg(FYI, "found a cached root file handle\n");
+ cfid = find_or_create_cached_dir(cfids, path, lookup_only);
+ if (cfid == NULL) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+ /*
+ * At this point we either have a lease already and we can just
+ * return it. If not we are guaranteed to be the only thread accessing
+ * this cfid.
+ */
+ if (cfid->has_lease) {
*ret_cfid = cfid;
- kref_get(&cfid->refcount);
- mutex_unlock(&cfid->fid_mutex);
+ kfree(utf16_path);
return 0;
}
/*
* We do not hold the lock for the open because in case
- * SMB2_open needs to reconnect, it will end up calling
- * cifs_mark_open_files_invalid() which takes the lock again
- * thus causing a deadlock
+ * SMB2_open needs to reconnect.
+ * This is safe because no other thread will be able to get a ref
+ * to the cfid until we have finished opening the file and (possibly)
+ * acquired a lease.
*/
- mutex_unlock(&cfid->fid_mutex);
-
- if (lookup_only)
- return -ENOENT;
-
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- if (!server->ops->new_lease_key)
- return -EIO;
-
pfid = &cfid->fid;
server->ops->new_lease_key(pfid);
@@ -99,7 +189,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.reconnect = false;
rc = SMB2_open_init(tcon, server,
- &rqst[0], &oplock, &oparms, &utf16_path);
+ &rqst[0], &oplock, &oparms, utf16_path);
if (rc)
goto oshr_free;
smb2_set_next_command(tcon, &rqst[0]);
@@ -122,47 +212,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
- mutex_lock(&cfid->fid_mutex);
-
- /*
- * Now we need to check again as the cached root might have
- * been successfully re-opened from a concurrent process
- */
-
- if (cfid->is_valid) {
- /* work was already done */
-
- /* stash fids for close() later */
- struct cifs_fid fid = {
- .persistent_fid = pfid->persistent_fid,
- .volatile_fid = pfid->volatile_fid,
- };
-
- /*
- * caller expects this func to set the fid in cfid to valid
- * cached root, so increment the refcount.
- */
- kref_get(&cfid->refcount);
-
- mutex_unlock(&cfid->fid_mutex);
-
- if (rc == 0) {
- /* close extra handle outside of crit sec */
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- }
- rc = 0;
- goto oshr_free;
- }
-
- /* Cached root is still invalid, continue normaly */
-
if (rc) {
if (rc == -EREMCHG) {
tcon->need_reconnect = true;
pr_warn_once("server share %s deleted\n",
- tcon->treeName);
+ tcon->tree_name);
}
- goto oshr_exit;
+ goto oshr_free;
}
atomic_inc(&tcon->num_remote_opens);
@@ -174,30 +230,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
- cfid->tcon = tcon;
- cfid->is_valid = true;
- cfid->dentry = dentry;
- dget(dentry);
- kref_init(&cfid->refcount);
+ if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
+ goto oshr_free;
- /* BB TBD check to see if oplock level check can be removed below */
- if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
- /*
- * See commit 2f94a3125b87. Increment the refcount when we
- * get a lease for root, release it if lease break occurs
- */
- kref_get(&cfid->refcount);
- cfid->has_lease = true;
- smb2_parse_contexts(server, o_rsp,
- &oparms.fid->epoch,
- oparms.fid->lease_key, &oplock,
- NULL, NULL);
- } else
- goto oshr_exit;
+
+ smb2_parse_contexts(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key, &oplock,
+ NULL, NULL);
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
- goto oshr_exit;
+ goto oshr_free;
if (!smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
sizeof(struct smb2_file_all_info),
@@ -205,15 +249,42 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
(char *)&cfid->file_all_info))
cfid->file_all_info_is_valid = true;
+ if (!path[0])
+ dentry = dget(cifs_sb->root);
+ else {
+ dentry = path_to_dentry(cifs_sb, path);
+ if (IS_ERR(dentry)) {
+ rc = -ENOENT;
+ goto oshr_free;
+ }
+ }
+ cfid->dentry = dentry;
+ cfid->tcon = tcon;
cfid->time = jiffies;
+ cfid->is_open = true;
+ cfid->has_lease = true;
-oshr_exit:
- mutex_unlock(&cfid->fid_mutex);
oshr_free:
+ kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ spin_lock(&cfids->cfid_list_lock);
+ if (!cfid->has_lease) {
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+ }
+ rc = -ENOENT;
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+ if (rc) {
+ free_cached_dir(cfid);
+ cfid = NULL;
+ }
+
if (rc == 0)
*ret_cfid = cfid;
@@ -225,18 +296,22 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
struct cached_fid **ret_cfid)
{
struct cached_fid *cfid;
+ struct cached_fids *cfids = tcon->cfids;
- cfid = tcon->cfid;
+ if (cfids == NULL)
+ return -ENOENT;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry == dentry) {
- cifs_dbg(FYI, "found a cached root file handle by dentry\n");
- *ret_cfid = cfid;
- kref_get(&cfid->refcount);
- mutex_unlock(&cfid->fid_mutex);
- return 0;
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ if (dentry && cfid->dentry == dentry) {
+ cifs_dbg(FYI, "found a cached root file handle by dentry\n");
+ kref_get(&cfid->refcount);
+ *ret_cfid = cfid;
+ spin_unlock(&cfids->cfid_list_lock);
+ return 0;
+ }
}
- mutex_unlock(&cfid->fid_mutex);
+ spin_unlock(&cfids->cfid_list_lock);
return -ENOENT;
}
@@ -245,63 +320,50 @@ smb2_close_cached_fid(struct kref *ref)
{
struct cached_fid *cfid = container_of(ref, struct cached_fid,
refcount);
- struct cached_dirent *dirent, *q;
- if (cfid->is_valid) {
- cifs_dbg(FYI, "clear cached root file handle\n");
- SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
- cfid->fid.volatile_fid);
+ spin_lock(&cfid->cfids->cfid_list_lock);
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfid->cfids->num_entries--;
}
+ spin_unlock(&cfid->cfids->cfid_list_lock);
- /*
- * We only check validity above to send SMB2_close,
- * but we still need to invalidate these entries
- * when this function is called
- */
- cfid->is_valid = false;
- cfid->file_all_info_is_valid = false;
- cfid->has_lease = false;
- if (cfid->dentry) {
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
- /*
- * Delete all cached dirent names
- */
- mutex_lock(&cfid->dirents.de_mutex);
- list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
- list_del(&dirent->entry);
- kfree(dirent->name);
- kfree(dirent);
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+
+ if (cfid->is_open) {
+ SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid);
}
- cfid->dirents.is_valid = 0;
- cfid->dirents.is_failed = 0;
- cfid->dirents.ctx = NULL;
- cfid->dirents.pos = 0;
- mutex_unlock(&cfid->dirents.de_mutex);
+ free_cached_dir(cfid);
}
-void close_cached_dir(struct cached_fid *cfid)
+void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb)
{
- mutex_lock(&cfid->fid_mutex);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- mutex_unlock(&cfid->fid_mutex);
-}
+ struct cached_fid *cfid = NULL;
+ int rc;
-void close_cached_dir_lease_locked(struct cached_fid *cfid)
-{
+ rc = open_cached_dir(xid, tcon, name, cifs_sb, true, &cfid);
+ if (rc) {
+ cifs_dbg(FYI, "no cached dir found for rmdir(%s)\n", name);
+ return;
+ }
+ spin_lock(&cfid->cfids->cfid_list_lock);
if (cfid->has_lease) {
cfid->has_lease = false;
kref_put(&cfid->refcount, smb2_close_cached_fid);
}
+ spin_unlock(&cfid->cfids->cfid_list_lock);
+ close_cached_dir(cfid);
}
-void close_cached_dir_lease(struct cached_fid *cfid)
+
+void close_cached_dir(struct cached_fid *cfid)
{
- mutex_lock(&cfid->fid_mutex);
- close_cached_dir_lease_locked(cfid);
- mutex_unlock(&cfid->fid_mutex);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
/*
@@ -314,34 +376,60 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
struct cached_fid *cfid;
struct cifs_tcon *tcon;
struct tcon_link *tlink;
+ struct cached_fids *cfids;
for (node = rb_first(root); node; node = rb_next(node)) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
tcon = tlink_tcon(tlink);
if (IS_ERR(tcon))
continue;
- cfid = tcon->cfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry) {
+ cfids = tcon->cfids;
+ if (cfids == NULL)
+ continue;
+ list_for_each_entry(cfid, &cfids->entries, entry) {
dput(cfid->dentry);
cfid->dentry = NULL;
}
- mutex_unlock(&cfid->fid_mutex);
}
}
/*
- * Invalidate and close all cached dirs when a TCON has been reset
+ * Invalidate all cached dirs when a TCON has been reset
* due to a session loss.
*/
void invalidate_all_cached_dirs(struct cifs_tcon *tcon)
{
- mutex_lock(&tcon->cfid->fid_mutex);
- tcon->cfid->is_valid = false;
- /* cached handle is not valid, so SMB2_CLOSE won't be sent below */
- close_cached_dir_lease_locked(tcon->cfid);
- memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid));
- mutex_unlock(&tcon->cfid->fid_mutex);
+ struct cached_fids *cfids = tcon->cfids;
+ struct cached_fid *cfid, *q;
+ LIST_HEAD(entry);
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
+ list_move(&cfid->entry, &entry);
+ cfids->num_entries--;
+ cfid->is_open = false;
+ cfid->on_list = false;
+ /* To prevent race with smb2_cached_lease_break() */
+ kref_get(&cfid->refcount);
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+ list_for_each_entry_safe(cfid, q, &entry, entry) {
+ list_del(&cfid->entry);
+ cancel_work_sync(&cfid->lease_break);
+ if (cfid->has_lease) {
+ /*
+ * We lease was never cancelled from the server so we
+ * need to drop the reference.
+ */
+ spin_lock(&cfids->cfid_list_lock);
+ cfid->has_lease = false;
+ spin_unlock(&cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
+ /* Drop the extra reference opened above*/
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
}
static void
@@ -350,39 +438,121 @@ smb2_cached_lease_break(struct work_struct *work)
struct cached_fid *cfid = container_of(work,
struct cached_fid, lease_break);
- close_cached_dir_lease(cfid);
+ spin_lock(&cfid->cfids->cfid_list_lock);
+ cfid->has_lease = false;
+ spin_unlock(&cfid->cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
{
- if (tcon->cfid->is_valid &&
- !memcmp(lease_key,
- tcon->cfid->fid.lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- tcon->cfid->time = 0;
- INIT_WORK(&tcon->cfid->lease_break,
- smb2_cached_lease_break);
- queue_work(cifsiod_wq,
- &tcon->cfid->lease_break);
- return true;
+ struct cached_fids *cfids = tcon->cfids;
+ struct cached_fid *cfid;
+
+ if (cfids == NULL)
+ return false;
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ if (cfid->has_lease &&
+ !memcmp(lease_key,
+ cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE)) {
+ cfid->time = 0;
+ /*
+ * We found a lease remove it from the list
+ * so no threads can access it.
+ */
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+
+ queue_work(cifsiod_wq,
+ &cfid->lease_break);
+ spin_unlock(&cfids->cfid_list_lock);
+ return true;
+ }
}
+ spin_unlock(&cfids->cfid_list_lock);
return false;
}
-struct cached_fid *init_cached_dir(void)
+static struct cached_fid *init_cached_dir(const char *path)
{
struct cached_fid *cfid;
- cfid = kzalloc(sizeof(*cfid), GFP_KERNEL);
+ cfid = kzalloc(sizeof(*cfid), GFP_ATOMIC);
if (!cfid)
return NULL;
+ cfid->path = kstrdup(path, GFP_ATOMIC);
+ if (!cfid->path) {
+ kfree(cfid);
+ return NULL;
+ }
+
+ INIT_WORK(&cfid->lease_break, smb2_cached_lease_break);
+ INIT_LIST_HEAD(&cfid->entry);
INIT_LIST_HEAD(&cfid->dirents.entries);
mutex_init(&cfid->dirents.de_mutex);
- mutex_init(&cfid->fid_mutex);
+ spin_lock_init(&cfid->fid_lock);
+ kref_init(&cfid->refcount);
return cfid;
}
-void free_cached_dir(struct cifs_tcon *tcon)
+static void free_cached_dir(struct cached_fid *cfid)
+{
+ struct cached_dirent *dirent, *q;
+
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+
+ /*
+ * Delete all cached dirent names
+ */
+ list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
+ list_del(&dirent->entry);
+ kfree(dirent->name);
+ kfree(dirent);
+ }
+
+ kfree(cfid->path);
+ cfid->path = NULL;
+ kfree(cfid);
+}
+
+struct cached_fids *init_cached_dirs(void)
+{
+ struct cached_fids *cfids;
+
+ cfids = kzalloc(sizeof(*cfids), GFP_KERNEL);
+ if (!cfids)
+ return NULL;
+ spin_lock_init(&cfids->cfid_list_lock);
+ INIT_LIST_HEAD(&cfids->entries);
+ return cfids;
+}
+
+/*
+ * Called from tconInfoFree when we are tearing down the tcon.
+ * There are no active users or open files/directories at this point.
+ */
+void free_cached_dirs(struct cached_fids *cfids)
{
- kfree(tcon->cfid);
+ struct cached_fid *cfid, *q;
+ LIST_HEAD(entry);
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
+ cfid->on_list = false;
+ cfid->is_open = false;
+ list_move(&cfid->entry, &entry);
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+ list_for_each_entry_safe(cfid, q, &entry, entry) {
+ list_del(&cfid->entry);
+ free_cached_dir(cfid);
+ }
+
+ kfree(cfids);
}
diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h
index bd262dc8b179..2f4e764c9ca9 100644
--- a/fs/cifs/cached_dir.h
+++ b/fs/cifs/cached_dir.h
@@ -31,13 +31,17 @@ struct cached_dirents {
};
struct cached_fid {
- bool is_valid:1; /* Do we have a useable root fid */
- bool file_all_info_is_valid:1;
+ struct list_head entry;
+ struct cached_fids *cfids;
+ const char *path;
bool has_lease:1;
+ bool is_open:1;
+ bool on_list:1;
+ bool file_all_info_is_valid:1;
unsigned long time; /* jiffies of when lease was taken */
struct kref refcount;
struct cifs_fid fid;
- struct mutex fid_mutex;
+ spinlock_t fid_lock;
struct cifs_tcon *tcon;
struct dentry *dentry;
struct work_struct lease_break;
@@ -45,8 +49,18 @@ struct cached_fid {
struct cached_dirents dirents;
};
-extern struct cached_fid *init_cached_dir(void);
-extern void free_cached_dir(struct cifs_tcon *tcon);
+#define MAX_CACHED_FIDS 16
+struct cached_fids {
+ /* Must be held when:
+ * - accessing the cfids->entries list
+ */
+ spinlock_t cfid_list_lock;
+ int num_entries;
+ struct list_head entries;
+};
+
+extern struct cached_fids *init_cached_dirs(void);
+extern void free_cached_dirs(struct cached_fids *cfids);
extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
const char *path,
struct cifs_sb_info *cifs_sb,
@@ -55,8 +69,10 @@ extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
struct dentry *dentry,
struct cached_fid **cfid);
extern void close_cached_dir(struct cached_fid *cfid);
-extern void close_cached_dir_lease(struct cached_fid *cfid);
-extern void close_cached_dir_lease_locked(struct cached_fid *cfid);
+extern void drop_cached_dir_by_name(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *name,
+ struct cifs_sb_info *cifs_sb);
extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb);
extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon);
extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index c05477e28cff..90850da390ae 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -87,7 +87,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
{
__u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
- seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count);
+ seq_printf(m, "%s Mounts: %d ", tcon->tree_name, tcon->tc_count);
if (tcon->nativeFileSystem)
seq_printf(m, "Type: %s ", tcon->nativeFileSystem);
seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d",
@@ -601,7 +601,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
i++;
- seq_printf(m, "\n%d) %s", i, tcon->treeName);
+ seq_printf(m, "\n%d) %s", i, tcon->tree_name);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
seq_printf(m, "\nSMBs: %d",
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index ee4ea2b60c0f..d44808263cfb 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -108,8 +108,8 @@ do { \
#define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \
do { \
const char *tn = ""; \
- if (tcon && tcon->treeName) \
- tn = tcon->treeName; \
+ if (tcon && tcon->tree_name) \
+ tn = tcon->tree_name; \
if ((type) & FYI && cifsFYI & CIFS_INFO) { \
pr_debug_ ## ratefunc("%s: %s " fmt, \
__FILE__, tn, ##__VA_ARGS__); \
@@ -150,7 +150,7 @@ do { \
#define cifs_tcon_dbg(type, fmt, ...) \
do { \
if (0) \
- pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__); \
+ pr_debug("%s " fmt, tcon->tree_name, ##__VA_ARGS__); \
} while (0)
#define cifs_info(fmt, ...) \
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index b87cbbe6d2d4..d86d78d5bfdc 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -91,6 +91,13 @@ struct smb3_notify {
bool watch_tree;
} __packed;
+struct smb3_notify_info {
+ __u32 completion_filter;
+ bool watch_tree;
+ __u32 data_len; /* size of notify data below */
+ __u8 notify_data[];
+} __packed;
+
#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
@@ -100,6 +107,7 @@ struct smb3_notify {
#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
+#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info)
#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32)
/*
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 1e4c7cc5287f..7233c6a7e6d7 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -256,23 +256,23 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon)
const char *share_name;
const char *net_name;
- net_name = extract_hostname(tcon->treeName);
+ net_name = extract_hostname(tcon->tree_name);
if (IS_ERR(net_name)) {
int ret;
ret = PTR_ERR(net_name);
cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n",
- __func__, tcon->treeName, ret);
+ __func__, tcon->tree_name, ret);
return ERR_PTR(-EINVAL);
}
- share_name = extract_sharename(tcon->treeName);
+ share_name = extract_sharename(tcon->tree_name);
if (IS_ERR(share_name)) {
int ret;
ret = PTR_ERR(share_name);
cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n",
- __func__, tcon->treeName, ret);
+ __func__, tcon->tree_name, ret);
kfree(net_name);
return ERR_PTR(-EINVAL);
}
@@ -335,14 +335,14 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon)
goto fail;
}
- reg->net_name = extract_hostname(tcon->treeName);
+ reg->net_name = extract_hostname(tcon->tree_name);
if (IS_ERR(reg->net_name)) {
ret = PTR_ERR(reg->net_name);
cifs_dbg(VFS, "%s: failed to extract host name from target: %d\n", __func__, ret);
goto fail_idr;
}
- reg->share_name = extract_sharename(tcon->treeName);
+ reg->share_name = extract_sharename(tcon->tree_name);
if (IS_ERR(reg->share_name)) {
ret = PTR_ERR(reg->share_name);
cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 46f5718754f9..5db73c0f792a 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -103,26 +103,24 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
if (!rqst->rq_iov || !signature || !server)
return -EINVAL;
- rc = cifs_alloc_hash("md5", &server->secmech.md5,
- &server->secmech.sdescmd5);
+ rc = cifs_alloc_hash("md5", &server->secmech.md5);
if (rc)
return -1;
- rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+ rc = crypto_shash_init(server->secmech.md5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
return rc;
}
- rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
+ rc = crypto_shash_update(server->secmech.md5,
server->session_key.response, server->session_key.len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
return rc;
}
- return __cifs_calc_signature(rqst, server, signature,
- &server->secmech.sdescmd5->shash);
+ return __cifs_calc_signature(rqst, server, signature, server->secmech.md5);
}
/* must be called with server->srv_mutex held */
@@ -412,7 +410,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
wchar_t *domain;
wchar_t *server;
- if (!ses->server->secmech.sdeschmacmd5) {
+ if (!ses->server->secmech.hmacmd5) {
cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
@@ -420,14 +418,14 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
/* calculate md4 hash of password */
E_md4hash(ses->password, nt_hash, nls_cp);
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash,
CIFS_NTHASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
return rc;
}
- rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+ rc = crypto_shash_init(ses->server->secmech.hmacmd5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
return rc;
@@ -448,7 +446,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
memset(user, '\0', 2);
}
- rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_update(ses->server->secmech.hmacmd5,
(char *)user, 2 * len);
kfree(user);
if (rc) {
@@ -468,7 +466,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
nls_cp);
rc =
- crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ crypto_shash_update(ses->server->secmech.hmacmd5,
(char *)domain, 2 * len);
kfree(domain);
if (rc) {
@@ -488,7 +486,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len,
nls_cp);
rc =
- crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ crypto_shash_update(ses->server->secmech.hmacmd5,
(char *)server, 2 * len);
kfree(server);
if (rc) {
@@ -498,7 +496,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
}
}
- rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_final(ses->server->secmech.hmacmd5,
ntlmv2_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
@@ -518,12 +516,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
offsetof(struct ntlmv2_resp, challenge.key[0]));
- if (!ses->server->secmech.sdeschmacmd5) {
+ if (!ses->server->secmech.hmacmd5) {
cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
@@ -531,7 +529,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
return rc;
}
- rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+ rc = crypto_shash_init(ses->server->secmech.hmacmd5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
return rc;
@@ -543,7 +541,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
else
memcpy(ntlmv2->challenge.key,
ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
- rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_update(ses->server->secmech.hmacmd5,
ntlmv2->challenge.key, hash_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
@@ -551,7 +549,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
}
/* Note that the MD5 digest over writes anon.challenge_key.key */
- rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_final(ses->server->secmech.hmacmd5,
ntlmv2->ntlmv2_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
@@ -627,9 +625,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
cifs_server_lock(ses->server);
- rc = cifs_alloc_hash("hmac(md5)",
- &ses->server->secmech.hmacmd5,
- &ses->server->secmech.sdeschmacmd5);
+ rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5);
if (rc) {
goto unlock;
}
@@ -649,7 +645,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
}
/* now calculate the session key for NTLMv2 */
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
@@ -657,13 +653,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
goto unlock;
}
- rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+ rc = crypto_shash_init(ses->server->secmech.hmacmd5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
goto unlock;
}
- rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_update(ses->server->secmech.hmacmd5,
ntlmv2->ntlmv2_hash,
CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
@@ -671,7 +667,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
goto unlock;
}
- rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_final(ses->server->secmech.hmacmd5,
ses->auth_key.response);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
@@ -679,7 +675,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
unlock:
cifs_server_unlock(ses->server);
setup_ntlmv2_rsp_ret:
- kfree(tiblob);
+ kfree_sensitive(tiblob);
return rc;
}
@@ -718,49 +714,19 @@ calc_seckey(struct cifs_ses *ses)
void
cifs_crypto_secmech_release(struct TCP_Server_Info *server)
{
- if (server->secmech.cmacaes) {
- crypto_free_shash(server->secmech.cmacaes);
- server->secmech.cmacaes = NULL;
- }
-
- if (server->secmech.hmacsha256) {
- crypto_free_shash(server->secmech.hmacsha256);
- server->secmech.hmacsha256 = NULL;
- }
-
- if (server->secmech.md5) {
- crypto_free_shash(server->secmech.md5);
- server->secmech.md5 = NULL;
- }
+ cifs_free_hash(&server->secmech.aes_cmac);
+ cifs_free_hash(&server->secmech.hmacsha256);
+ cifs_free_hash(&server->secmech.md5);
+ cifs_free_hash(&server->secmech.sha512);
+ cifs_free_hash(&server->secmech.hmacmd5);
- if (server->secmech.sha512) {
- crypto_free_shash(server->secmech.sha512);
- server->secmech.sha512 = NULL;
+ if (server->secmech.enc) {
+ crypto_free_aead(server->secmech.enc);
+ server->secmech.enc = NULL;
}
- if (server->secmech.hmacmd5) {
- crypto_free_shash(server->secmech.hmacmd5);
- server->secmech.hmacmd5 = NULL;
+ if (server->secmech.dec) {
+ crypto_free_aead(server->secmech.dec);
+ server->secmech.dec = NULL;
}
-
- if (server->secmech.ccmaesencrypt) {
- crypto_free_aead(server->secmech.ccmaesencrypt);
- server->secmech.ccmaesencrypt = NULL;
- }
-
- if (server->secmech.ccmaesdecrypt) {
- crypto_free_aead(server->secmech.ccmaesdecrypt);
- server->secmech.ccmaesdecrypt = NULL;
- }
-
- kfree(server->secmech.sdesccmacaes);
- server->secmech.sdesccmacaes = NULL;
- kfree(server->secmech.sdeschmacsha256);
- server->secmech.sdeschmacsha256 = NULL;
- kfree(server->secmech.sdeschmacmd5);
- server->secmech.sdeschmacmd5 = NULL;
- kfree(server->secmech.sdescmd5);
- server->secmech.sdescmd5 = NULL;
- kfree(server->secmech.sdescsha512);
- server->secmech.sdescsha512 = NULL;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8042d7280dec..d0b9fec111aa 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -396,6 +396,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->epoch = 0;
spin_lock_init(&cifs_inode->open_file_lock);
generate_random_uuid(cifs_inode->lease_key);
+ cifs_inode->symlink_target = NULL;
/*
* Can not set i_flags here - they get immediately overwritten to zero
@@ -412,7 +413,11 @@ cifs_alloc_inode(struct super_block *sb)
static void
cifs_free_inode(struct inode *inode)
{
- kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+ if (S_ISLNK(inode->i_mode))
+ kfree(cinode->symlink_target);
+ kmem_cache_free(cifs_inode_cachep, cinode);
}
static void
@@ -1139,7 +1144,7 @@ const struct inode_operations cifs_file_inode_ops = {
};
const struct inode_operations cifs_symlink_inode_ops = {
- .get_link = cifs_get_link,
+ .get_link = simple_get_link,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
};
@@ -1297,8 +1302,11 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
ssize_t rc;
struct cifsFileInfo *cfile = dst_file->private_data;
- if (cfile->swapfile)
- return -EOPNOTSUPP;
+ if (cfile->swapfile) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 5b4a7a32bdc5..388b745a978e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 39
-#define CIFS_VERSION "2.39"
+#define SMB3_PRODUCT_BUILD 40
+#define CIFS_VERSION "2.40"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ae7f571a7dba..1420acf987f0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -153,26 +153,16 @@ struct session_key {
char *response;
};
-/* crypto security descriptor definition */
-struct sdesc {
- struct shash_desc shash;
- char ctx[];
-};
-
/* crypto hashing related structure/fields, not specific to a sec mech */
struct cifs_secmech {
- struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
- struct crypto_shash *md5; /* md5 hash function */
- struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
- struct crypto_shash *cmacaes; /* block-cipher based MAC function */
- struct crypto_shash *sha512; /* sha512 hash function */
- struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
- struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
- struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
- struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
- struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */
- struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */
- struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */
+ struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */
+ struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */
+ struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */
+ struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */
+ struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */
+
+ struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */
+ struct crypto_aead *dec; /* smb3 decryption AEAD TFM (AES-CCM and AES-GCM) */
};
/* per smb session structure/fields */
@@ -195,6 +185,19 @@ struct cifs_cred {
struct cifs_ace *aces;
};
+struct cifs_open_info_data {
+ char *symlink_target;
+ union {
+ struct smb2_file_all_info fi;
+ struct smb311_posix_qinfo posix_fi;
+ };
+};
+
+static inline void cifs_free_open_info(struct cifs_open_info_data *data)
+{
+ kfree(data->symlink_target);
+}
+
/*
*****************************************************************
* Except the CIFS PDUs themselves all the
@@ -317,20 +320,20 @@ struct smb_version_operations {
int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const char *);
/* query path data from the server */
- int (*query_path_info)(const unsigned int, struct cifs_tcon *,
- struct cifs_sb_info *, const char *,
- FILE_ALL_INFO *, bool *, bool *);
+ int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
/* query file data from the server */
- int (*query_file_info)(const unsigned int, struct cifs_tcon *,
- struct cifs_fid *, FILE_ALL_INFO *);
+ int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, struct cifs_open_info_data *data);
/* query reparse tag from srv to determine which type of special file */
int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
/* get server index number */
- int (*get_srv_inum)(const unsigned int, struct cifs_tcon *,
- struct cifs_sb_info *, const char *,
- u64 *uniqueid, FILE_ALL_INFO *);
+ int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid,
+ struct cifs_open_info_data *data);
/* set size by path */
int (*set_path_size)(const unsigned int, struct cifs_tcon *,
const char *, __u64, struct cifs_sb_info *, bool);
@@ -379,8 +382,8 @@ struct smb_version_operations {
struct cifs_sb_info *, const char *,
char **, bool);
/* open a file for non-posix mounts */
- int (*open)(const unsigned int, struct cifs_open_parms *,
- __u32 *, FILE_ALL_INFO *);
+ int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
+ void *buf);
/* set fid protocol-specific info */
void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
/* close a file */
@@ -451,7 +454,7 @@ struct smb_version_operations {
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file, void __user *);
int (*notify)(const unsigned int xid, struct file *pfile,
- void __user *pbuf);
+ void __user *pbuf, bool return_changes);
int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const unsigned char *,
char *, unsigned int *);
@@ -1133,6 +1136,7 @@ struct cifs_fattr {
struct timespec64 cf_mtime;
struct timespec64 cf_ctime;
u32 cf_cifstag;
+ char *cf_symlink_target;
};
/*
@@ -1149,7 +1153,7 @@ struct cifs_tcon {
struct list_head openFileList;
spinlock_t open_file_lock; /* protects list above */
struct cifs_ses *ses; /* pointer to session associated with */
- char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
+ char tree_name[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
char *nativeFileSystem;
char *password; /* for share-level security */
__u32 tid; /* The 4 byte tree id */
@@ -1228,7 +1232,7 @@ struct cifs_tcon {
struct fscache_volume *fscache; /* cookie for share */
#endif
struct list_head pending_opens; /* list of incomplete opens */
- struct cached_fid *cfid; /* Cached root fid */
+ struct cached_fids *cfids;
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
struct list_head ulist; /* cache update list */
@@ -1395,6 +1399,7 @@ struct cifsFileInfo {
struct work_struct put; /* work for the final part of _put */
struct delayed_work deferred;
bool deferred_close_scheduled; /* Flag to indicate close is scheduled */
+ char *symlink_target;
};
struct cifs_io_parms {
@@ -1553,6 +1558,7 @@ struct cifsInodeInfo {
struct list_head deferred_closes; /* list of deferred closes */
spinlock_t deferred_lock; /* protection on deferred list */
bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
+ char *symlink_target;
};
static inline struct cifsInodeInfo *
@@ -2121,4 +2127,14 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses)
return sizeof(ses->workstation_name);
}
+static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src)
+{
+ memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src));
+ dst->AccessFlags = src->AccessFlags;
+ dst->CurrentByteOffset = src->CurrentByteOffset;
+ dst->Mode = src->Mode;
+ dst->AlignmentRequirement = src->AlignmentRequirement;
+ dst->FileNameLength = src->FileNameLength;
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index aeba371c4c70..d1abaeea974a 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -483,7 +483,7 @@ put_bcc(__u16 count, struct smb_hdr *hdr)
typedef struct negotiate_req {
struct smb_hdr hdr; /* wct = 0 */
__le16 ByteCount;
- unsigned char DialectsArray[1];
+ unsigned char DialectsArray[];
} __attribute__((packed)) NEGOTIATE_REQ;
#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
@@ -508,13 +508,14 @@ typedef struct negotiate_rsp {
__u8 EncryptionKeyLength;
__u16 ByteCount;
union {
- unsigned char EncryptionKey[1]; /* cap extended security off */
+ /* cap extended security off */
+ DECLARE_FLEX_ARRAY(unsigned char, EncryptionKey);
/* followed by Domain name - if extended security is off */
/* followed by 16 bytes of server GUID */
/* then security blob if cap_extended_security negotiated */
struct {
unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
- unsigned char SecurityBlob[1];
+ unsigned char SecurityBlob[];
} __attribute__((packed)) extended_response;
} __attribute__((packed)) u;
} __attribute__((packed)) NEGOTIATE_RSP;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3bc94bcc7177..83e83d8beabb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -182,10 +182,9 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile,
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
extern void cifs_down_write(struct rw_semaphore *sem);
-extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
- struct file *file,
- struct tcon_link *tlink,
- __u32 oplock);
+struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
+ struct tcon_link *tlink, __u32 oplock,
+ const char *symlink_target);
extern int cifs_posix_open(const char *full_path, struct inode **inode,
struct super_block *sb, int mode,
unsigned int f_flags, __u32 *oplock, __u16 *netfid,
@@ -200,9 +199,9 @@ extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
extern struct inode *cifs_iget(struct super_block *sb,
struct cifs_fattr *fattr);
-extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
- FILE_ALL_INFO *data, struct super_block *sb,
- int xid, const struct cifs_fid *fid);
+int cifs_get_inode_info(struct inode **inode, const char *full_path,
+ struct cifs_open_info_data *data, struct super_block *sb, int xid,
+ const struct cifs_fid *fid);
extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
struct super_block *sb, unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
@@ -598,9 +597,8 @@ struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
void cifs_aio_ctx_release(struct kref *refcount);
int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
-int cifs_alloc_hash(const char *name, struct crypto_shash **shash,
- struct sdesc **sdesc);
-void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
+int cifs_alloc_hash(const char *name, struct shash_desc **sdesc);
+void cifs_free_hash(struct shash_desc **sdesc);
extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
unsigned int *len, unsigned int *offset);
@@ -639,7 +637,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses,
int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
int
-SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon);
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7aa91e272027..1724066c1536 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -465,7 +465,7 @@ CIFSSMBNegotiate(const unsigned int xid,
for (i = 0; i < CIFS_NUM_PROT; i++) {
size_t len = strlen(protocols[i].name) + 1;
- memcpy(pSMB->DialectsArray+count, protocols[i].name, len);
+ memcpy(&pSMB->DialectsArray[count], protocols[i].name, len);
count += len;
}
inc_rfc1001_len(pSMB, count);
@@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
remap);
}
rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
- count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
+ count = sizeof(struct set_file_rename) + (2 * len_of_str);
byte_count += count;
pSMB->DataCount = cpu_to_le16(count);
pSMB->TotalDataCount = pSMB->DataCount;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ae6f2c08153..ffb291579bb9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -155,7 +155,7 @@ static void smb2_query_server_interfaces(struct work_struct *work)
/*
* query server network interfaces, in case they change
*/
- rc = SMB3_request_interfaces(0, tcon);
+ rc = SMB3_request_interfaces(0, tcon, false);
if (rc) {
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
@@ -311,7 +311,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
}
server->sequence_number = 0;
server->session_estab = false;
- kfree(server->session_key.response);
+ kfree_sensitive(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
server->lstrp = jiffies;
@@ -1580,7 +1580,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
cifs_crypto_secmech_release(server);
- kfree(server->session_key.response);
+ kfree_sensitive(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
kfree(server->hostname);
@@ -1940,7 +1940,8 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
spin_unlock(&ses->ses_lock);
cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
- cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE");
+ cifs_dbg(FYI,
+ "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE");
spin_lock(&cifs_tcp_ses_lock);
if (--ses->ses_count > 0) {
@@ -2293,7 +2294,7 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
if (tcon->status == TID_EXITING)
return 0;
- if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE))
+ if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE))
return 0;
if (tcon->seal != ctx->seal)
return 0;
@@ -2831,9 +2832,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* sessinit is sent but no second negprot
*/
struct rfc1002_session_packet *ses_init_buf;
+ unsigned int req_noscope_len;
struct smb_hdr *smb_buf;
+
ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
GFP_KERNEL);
+
if (ses_init_buf) {
ses_init_buf->trailer.session_req.called_len = 32;
@@ -2869,8 +2873,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
ses_init_buf->trailer.session_req.scope2 = 0;
smb_buf = (struct smb_hdr *)ses_init_buf;
- /* sizeof RFC1002_SESSION_REQUEST with no scope */
- smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
+ /* sizeof RFC1002_SESSION_REQUEST with no scopes */
+ req_noscope_len = sizeof(struct rfc1002_session_packet) - 2;
+
+ /* == cpu_to_be32(0x81000044) */
+ smb_buf->smb_buf_length =
+ cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len);
rc = smb_send(server, smb_buf, 0x44);
kfree(ses_init_buf);
/*
@@ -3921,12 +3929,11 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
pSMB->AndXCommand = 0xFF;
pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
bcc_ptr = &pSMB->Password[0];
- if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) {
- pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
- *bcc_ptr = 0; /* password is null byte */
- bcc_ptr++; /* skip password */
- /* already aligned so no need to do it below */
- }
+
+ pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
+ *bcc_ptr = 0; /* password is null byte */
+ bcc_ptr++; /* skip password */
+ /* already aligned so no need to do it below */
if (ses->server->sign)
smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
@@ -3989,7 +3996,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
}
bcc_ptr += length + 1;
bytes_left -= (length + 1);
- strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name));
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
@@ -4134,7 +4141,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (ses->auth_key.response) {
cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
ses->auth_key.response);
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
ses->auth_key.len = 0;
}
@@ -4197,7 +4204,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
ctx->local_nls = cifs_sb->local_nls;
ctx->linux_uid = fsuid;
ctx->cred_uid = fsuid;
- ctx->UNC = master_tcon->treeName;
+ ctx->UNC = master_tcon->tree_name;
ctx->retry = master_tcon->retry;
ctx->nocase = master_tcon->nocase;
ctx->nohandlecache = master_tcon->nohandlecache;
@@ -4663,7 +4670,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
if (!server->current_fullpath ||
dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) {
- rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, cifs_sb->local_nls);
+ rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls);
goto out;
}
@@ -4707,7 +4714,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
tcon->status = TID_IN_TCON;
spin_unlock(&tcon->tc_lock);
- rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
+ rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, nlsc);
if (rc) {
spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index a9b6c3eba6de..e70915ad7541 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -98,7 +98,7 @@ static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const c
get_ipc_unc(path, unc, sizeof(unc));
for (; *ses; ses++) {
- if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName))
+ if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name))
return *ses;
}
return ERR_PTR(-ENOENT);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 08f7392716e2..8b1c37158556 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -50,7 +50,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s
}
if (add_treename)
- dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+ dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
@@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s
return full_path;
if (dfsplen)
- memcpy(full_path, tcon->treeName, dfsplen);
+ memcpy(full_path, tcon->tree_name, dfsplen);
full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
memcpy(full_path + dfsplen + 1, ctx->prepath, pplen);
convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
@@ -93,7 +93,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return ERR_PTR(-ENOMEM);
if (prefix)
- dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+ dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
@@ -123,7 +123,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
}
if (dfsplen) {
s -= dfsplen;
- memcpy(s, tcon->treeName, dfsplen);
+ memcpy(s, tcon->tree_name, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
int i;
for (i = 0; i < dfsplen; i++) {
@@ -165,10 +165,9 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon)
/* Inode operations in similar order to how they appear in Linux file fs.h */
-static int
-cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
- struct tcon_link *tlink, unsigned oflags, umode_t mode,
- __u32 *oplock, struct cifs_fid *fid)
+static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
+ struct tcon_link *tlink, unsigned int oflags, umode_t mode, __u32 *oplock,
+ struct cifs_fid *fid, struct cifs_open_info_data *buf)
{
int rc = -ENOENT;
int create_options = CREATE_NOT_DIR;
@@ -177,7 +176,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
struct cifs_tcon *tcon = tlink_tcon(tlink);
const char *full_path;
void *page = alloc_dentry_path();
- FILE_ALL_INFO *buf = NULL;
struct inode *newinode = NULL;
int disposition;
struct TCP_Server_Info *server = tcon->ses->server;
@@ -290,12 +288,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
goto out;
}
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
/*
* if we're not using unix extensions, see if we need to set
* ATTR_READONLY on the create call
@@ -364,8 +356,7 @@ cifs_create_get_file_info:
{
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* TODO: Add support for calling POSIX query info here, but passing in fid */
- rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
- xid, fid);
+ rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, xid, fid);
if (newinode) {
if (server->ops->set_lease_key)
server->ops->set_lease_key(newinode, fid);
@@ -402,7 +393,6 @@ cifs_create_set_dentry:
d_add(direntry, newinode);
out:
- kfree(buf);
free_dentry_path(page);
return rc;
@@ -423,10 +413,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
- struct cifs_fid fid;
+ struct cifs_fid fid = {};
struct cifs_pending_open open;
__u32 oplock;
struct cifsFileInfo *file_info;
+ struct cifs_open_info_data buf = {};
if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
return -EIO;
@@ -484,8 +475,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
cifs_add_pending_open(&fid, tlink, &open);
rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
- &oplock, &fid);
-
+ &oplock, &fid, &buf);
if (rc) {
cifs_del_pending_open(&open);
goto out;
@@ -510,7 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
file->f_op = &cifs_file_direct_ops;
}
- file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
+ file_info = cifs_new_fileinfo(&fid, file, tlink, oplock, buf.symlink_target);
if (file_info == NULL) {
if (server->ops->close)
server->ops->close(xid, tcon, &fid);
@@ -526,6 +516,7 @@ out:
cifs_put_tlink(tlink);
out_free_xid:
free_xid(xid);
+ cifs_free_open_info(&buf);
return rc;
}
@@ -547,12 +538,15 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
struct TCP_Server_Info *server;
struct cifs_fid fid;
__u32 oplock;
+ struct cifs_open_info_data buf = {};
cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
inode, direntry, direntry);
- if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
- return -EIO;
+ if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) {
+ rc = -EIO;
+ goto out_free_xid;
+ }
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
rc = PTR_ERR(tlink);
@@ -565,11 +559,11 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
if (server->ops->new_lease_key)
server->ops->new_lease_key(&fid);
- rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
- &oplock, &fid);
+ rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, &oplock, &fid, &buf);
if (!rc && server->ops->close)
server->ops->close(xid, tcon, &fid);
+ cifs_free_open_info(&buf);
cifs_put_tlink(tlink);
out_free_xid:
free_xid(xid);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6f38b134a346..5b3b308e115c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -209,16 +209,14 @@ posix_open_ret:
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-static int
-cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
- struct cifs_fid *fid, unsigned int xid)
+static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
+ struct cifs_fid *fid, unsigned int xid, struct cifs_open_info_data *buf)
{
int rc;
int desired_access;
int disposition;
int create_options = CREATE_NOT_DIR;
- FILE_ALL_INFO *buf;
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
@@ -255,10 +253,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci
/* BB pass O_SYNC flag through on file attributes .. BB */
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
/* O_SYNC also has bit for O_DSYNC so following check picks up either */
if (f_flags & O_SYNC)
create_options |= CREATE_WRITE_THROUGH;
@@ -276,9 +270,8 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci
oparms.reconnect = false;
rc = server->ops->open(xid, &oparms, oplock, buf);
-
if (rc)
- goto out;
+ return rc;
/* TODO: Add support for calling posix query info but with passing in fid */
if (tcon->unix_ext)
@@ -294,8 +287,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci
rc = -EOPENSTALE;
}
-out:
- kfree(buf);
return rc;
}
@@ -325,9 +316,9 @@ cifs_down_write(struct rw_semaphore *sem)
static void cifsFileInfo_put_work(struct work_struct *work);
-struct cifsFileInfo *
-cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
- struct tcon_link *tlink, __u32 oplock)
+struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
+ struct tcon_link *tlink, __u32 oplock,
+ const char *symlink_target)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
@@ -347,6 +338,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
return NULL;
}
+ if (symlink_target) {
+ cfile->symlink_target = kstrdup(symlink_target, GFP_KERNEL);
+ if (!cfile->symlink_target) {
+ kfree(fdlocks);
+ kfree(cfile);
+ return NULL;
+ }
+ }
+
INIT_LIST_HEAD(&fdlocks->locks);
fdlocks->cfile = cfile;
cfile->llist = fdlocks;
@@ -440,6 +440,7 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
cifs_put_tlink(cifs_file->tlink);
dput(cifs_file->dentry);
cifs_sb_deactive(sb);
+ kfree(cifs_file->symlink_target);
kfree(cifs_file);
}
@@ -488,7 +489,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct super_block *sb = inode->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifs_fid fid;
+ struct cifs_fid fid = {};
struct cifs_pending_open open;
bool oplock_break_cancelled;
@@ -570,8 +571,9 @@ int cifs_open(struct inode *inode, struct file *file)
void *page;
const char *full_path;
bool posix_open_ok = false;
- struct cifs_fid fid;
+ struct cifs_fid fid = {};
struct cifs_pending_open open;
+ struct cifs_open_info_data data = {};
xid = get_xid();
@@ -662,15 +664,15 @@ int cifs_open(struct inode *inode, struct file *file)
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &fid);
- rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
- file->f_flags, &oplock, &fid, xid);
+ rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, file->f_flags, &oplock, &fid,
+ xid, &data);
if (rc) {
cifs_del_pending_open(&open);
goto out;
}
}
- cfile = cifs_new_fileinfo(&fid, file, tlink, oplock);
+ cfile = cifs_new_fileinfo(&fid, file, tlink, oplock, data.symlink_target);
if (cfile == NULL) {
if (server->ops->close)
server->ops->close(xid, tcon, &fid);
@@ -712,6 +714,7 @@ out:
free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
+ cifs_free_open_info(&data);
return rc;
}
@@ -1882,11 +1885,13 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
struct cifsFileInfo *cfile;
__u32 type;
- rc = -EACCES;
xid = get_xid();
- if (!(fl->fl_flags & FL_FLOCK))
- return -ENOLCK;
+ if (!(fl->fl_flags & FL_FLOCK)) {
+ rc = -ENOLCK;
+ free_xid(xid);
+ return rc;
+ }
cfile = (struct cifsFileInfo *)file->private_data;
tcon = tlink_tcon(cfile->tlink);
@@ -1905,8 +1910,9 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
* if no lock or unlock then nothing to do since we do not
* know what it is
*/
+ rc = -EOPNOTSUPP;
free_xid(xid);
- return -EOPNOTSUPP;
+ return rc;
}
rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock,
@@ -4271,6 +4277,15 @@ static ssize_t __cifs_readv(
len = ctx->len;
}
+ if (direct) {
+ rc = filemap_write_and_wait_range(file->f_inode->i_mapping,
+ offset, offset + len - 1);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EAGAIN;
+ }
+ }
+
/* grab a lock here due to read response handlers can access ctx */
mutex_lock(&ctx->aio_mutex);
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 0e13dec86b25..45119597c765 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -791,6 +791,13 @@ do { \
cifs_sb->ctx->field = NULL; \
} while (0)
+#define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \
+do { \
+ kfree_sensitive(ctx->field); \
+ ctx->field = cifs_sb->ctx->field; \
+ cifs_sb->ctx->field = NULL; \
+} while (0)
+
static int smb3_reconfigure(struct fs_context *fc)
{
struct smb3_fs_context *ctx = smb3_fc2context(fc);
@@ -811,7 +818,7 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, UNC);
STEAL_STRING(cifs_sb, ctx, source);
STEAL_STRING(cifs_sb, ctx, username);
- STEAL_STRING(cifs_sb, ctx, password);
+ STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
STEAL_STRING(cifs_sb, ctx, domainname);
STEAL_STRING(cifs_sb, ctx, nodename);
STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -1162,7 +1169,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
break;
case Opt_pass:
- kfree(ctx->password);
+ kfree_sensitive(ctx->password);
ctx->password = NULL;
if (strlen(param->string) == 0)
break;
@@ -1470,6 +1477,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
return 0;
cifs_parse_mount_err:
+ kfree_sensitive(ctx->password);
return -EINVAL;
}
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 23ef56f55ce5..a1751b956318 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -45,7 +45,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
memset(&key, 0, sizeof(key));
- sharename = extract_sharename(tcon->treeName);
+ sharename = extract_sharename(tcon->tree_name);
if (IS_ERR(sharename)) {
cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
return -EINVAL;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bac08c20f559..9bde08d44617 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -210,6 +210,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
*/
inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
}
+
+ if (S_ISLNK(fattr->cf_mode)) {
+ kfree(cifs_i->symlink_target);
+ cifs_i->symlink_target = fattr->cf_symlink_target;
+ fattr->cf_symlink_target = NULL;
+
+ if (unlikely(!cifs_i->symlink_target))
+ inode->i_link = ERR_PTR(-EOPNOTSUPP);
+ else
+ inode->i_link = cifs_i->symlink_target;
+ }
spin_unlock(&inode->i_lock);
if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
@@ -347,13 +358,22 @@ cifs_get_file_info_unix(struct file *filp)
int rc;
unsigned int xid;
FILE_UNIX_BASIC_INFO find_data;
- struct cifs_fattr fattr;
+ struct cifs_fattr fattr = {};
struct inode *inode = file_inode(filp);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
xid = get_xid();
+
+ if (cfile->symlink_target) {
+ fattr.cf_symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!fattr.cf_symlink_target) {
+ rc = -ENOMEM;
+ goto cifs_gfiunix_out;
+ }
+ }
+
rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data);
if (!rc) {
cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
@@ -378,6 +398,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
FILE_UNIX_BASIC_INFO find_data;
struct cifs_fattr fattr;
struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -387,10 +408,12 @@ int cifs_get_inode_info_unix(struct inode **pinode,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
/* could have done a find first instead but this returns more info */
rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
cifs_sb->local_nls, cifs_remap(cifs_sb));
+ cifs_dbg(FYI, "%s: query path info: rc = %d\n", __func__, rc);
cifs_put_tlink(tlink);
if (!rc) {
@@ -410,6 +433,17 @@ int cifs_get_inode_info_unix(struct inode **pinode,
cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
+ if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) {
+ if (!server->ops->query_symlink)
+ return -EOPNOTSUPP;
+ rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
+ &fattr.cf_symlink_target, false);
+ if (rc) {
+ cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc);
+ goto cgiiu_exit;
+ }
+ }
+
if (*pinode == NULL) {
/* get new inode */
cifs_fill_uniqueid(sb, &fattr);
@@ -432,6 +466,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
}
cgiiu_exit:
+ kfree(fattr.cf_symlink_target);
return rc;
}
#else
@@ -601,10 +636,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
}
/* Fill a cifs_fattr struct with info from POSIX info struct */
-static void
-smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *info,
- struct super_block *sb, bool adjust_tz, bool symlink)
+static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
+ struct super_block *sb, bool adjust_tz, bool symlink)
{
+ struct smb311_posix_qinfo *info = &data->posix_fi;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -639,6 +674,8 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *
if (symlink) {
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
+ fattr->cf_symlink_target = data->symlink_target;
+ data->symlink_target = NULL;
} else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode |= S_IFDIR;
fattr->cf_dtype = DT_DIR;
@@ -655,13 +692,11 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
}
-
-/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
-static void
-cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- struct super_block *sb, bool adjust_tz,
- bool symlink, u32 reparse_tag)
+static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
+ struct super_block *sb, bool adjust_tz, bool symlink,
+ u32 reparse_tag)
{
+ struct smb2_file_all_info *info = &data->fi;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -703,7 +738,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
} else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) {
fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
fattr->cf_dtype = DT_BLK;
- } else if (symlink) { /* TODO add more reparse tag checks */
+ } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK ||
+ reparse_tag == IO_REPARSE_TAG_NFS) {
fattr->cf_mode = S_IFLNK;
fattr->cf_dtype = DT_LNK;
} else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
@@ -735,6 +771,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
}
}
+ if (S_ISLNK(fattr->cf_mode)) {
+ fattr->cf_symlink_target = data->symlink_target;
+ data->symlink_target = NULL;
+ }
+
fattr->cf_uid = cifs_sb->ctx->linux_uid;
fattr->cf_gid = cifs_sb->ctx->linux_gid;
}
@@ -744,23 +785,28 @@ cifs_get_file_info(struct file *filp)
{
int rc;
unsigned int xid;
- FILE_ALL_INFO find_data;
+ struct cifs_open_info_data data = {};
struct cifs_fattr fattr;
struct inode *inode = file_inode(filp);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
+ bool symlink = false;
+ u32 tag = 0;
if (!server->ops->query_file_info)
return -ENOSYS;
xid = get_xid();
- rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
+ rc = server->ops->query_file_info(xid, tcon, cfile, &data);
switch (rc) {
case 0:
/* TODO: add support to query reparse tag */
- cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false,
- false, 0 /* no reparse tag */);
+ if (data.symlink_target) {
+ symlink = true;
+ tag = IO_REPARSE_TAG_SYMLINK;
+ }
+ cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag);
break;
case -EREMOTE:
cifs_create_dfs_fattr(&fattr, inode->i_sb);
@@ -789,6 +835,7 @@ cifs_get_file_info(struct file *filp)
/* if filetype is different, return error */
rc = cifs_fattr_to_inode(inode, &fattr);
cgfi_exit:
+ cifs_free_open_info(&data);
free_xid(xid);
return rc;
}
@@ -860,14 +907,9 @@ cifs_backup_query_path_info(int xid,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-static void
-cifs_set_fattr_ino(int xid,
- struct cifs_tcon *tcon,
- struct super_block *sb,
- struct inode **inode,
- const char *full_path,
- FILE_ALL_INFO *data,
- struct cifs_fattr *fattr)
+static void cifs_set_fattr_ino(int xid, struct cifs_tcon *tcon, struct super_block *sb,
+ struct inode **inode, const char *full_path,
+ struct cifs_open_info_data *data, struct cifs_fattr *fattr)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -885,11 +927,8 @@ cifs_set_fattr_ino(int xid,
* If we have an inode pass a NULL tcon to ensure we don't
* make a round trip to the server. This only works for SMB2+.
*/
- rc = server->ops->get_srv_inum(xid,
- *inode ? NULL : tcon,
- cifs_sb, full_path,
- &fattr->cf_uniqueid,
- data);
+ rc = server->ops->get_srv_inum(xid, *inode ? NULL : tcon, cifs_sb, full_path,
+ &fattr->cf_uniqueid, data);
if (rc) {
/*
* If that fails reuse existing ino or generate one
@@ -913,7 +952,7 @@ cifs_set_fattr_ino(int xid,
} else {
/* make an ino by hashing the UNC */
fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO;
- fattr->cf_uniqueid = simple_hashstr(tcon->treeName);
+ fattr->cf_uniqueid = simple_hashstr(tcon->tree_name);
}
}
}
@@ -923,14 +962,10 @@ static inline bool is_inode_cache_good(struct inode *ino)
return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0;
}
-int
-cifs_get_inode_info(struct inode **inode,
- const char *full_path,
- FILE_ALL_INFO *in_data,
- struct super_block *sb, int xid,
- const struct cifs_fid *fid)
+int cifs_get_inode_info(struct inode **inode, const char *full_path,
+ struct cifs_open_info_data *data, struct super_block *sb, int xid,
+ const struct cifs_fid *fid)
{
-
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct tcon_link *tlink;
@@ -938,8 +973,7 @@ cifs_get_inode_info(struct inode **inode,
bool adjust_tz = false;
struct cifs_fattr fattr = {0};
bool is_reparse_point = false;
- FILE_ALL_INFO *data = in_data;
- FILE_ALL_INFO *tmp_data = NULL;
+ struct cifs_open_info_data tmp_data = {};
void *smb1_backup_rsp_buf = NULL;
int rc = 0;
int tmprc = 0;
@@ -960,21 +994,15 @@ cifs_get_inode_info(struct inode **inode,
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto out;
}
- tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (!tmp_data) {
- rc = -ENOMEM;
- goto out;
- }
- rc = server->ops->query_path_info(xid, tcon, cifs_sb,
- full_path, tmp_data,
- &adjust_tz, &is_reparse_point);
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data,
+ &adjust_tz, &is_reparse_point);
#ifdef CONFIG_CIFS_DFS_UPCALL
if (rc == -ENOENT && is_tcon_dfs(tcon))
rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon,
cifs_sb,
full_path);
#endif
- data = tmp_data;
+ data = &tmp_data;
}
/*
@@ -988,14 +1016,24 @@ cifs_get_inode_info(struct inode **inode,
* since we have to check if its reparse tag matches a known
* special file type e.g. symlink or fifo or char etc.
*/
- if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) &&
- server->ops->query_reparse_tag) {
- rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb,
- full_path, &reparse_tag);
- cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag);
+ if (is_reparse_point && data->symlink_target) {
+ reparse_tag = IO_REPARSE_TAG_SYMLINK;
+ } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) &&
+ server->ops->query_reparse_tag) {
+ tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path,
+ &reparse_tag);
+ if (tmprc)
+ cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc);
+ if (server->ops->query_symlink) {
+ tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
+ &data->symlink_target,
+ is_reparse_point);
+ if (tmprc)
+ cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__,
+ tmprc);
+ }
}
- cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
- is_reparse_point, reparse_tag);
+ cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -1014,18 +1052,20 @@ cifs_get_inode_info(struct inode **inode,
*/
if (backup_cred(cifs_sb) && is_smb1_server(server)) {
/* for easier reading */
+ FILE_ALL_INFO *fi;
FILE_DIRECTORY_INFO *fdi;
SEARCH_ID_FULL_DIR_INFO *si;
rc = cifs_backup_query_path_info(xid, tcon, sb,
full_path,
&smb1_backup_rsp_buf,
- &data);
+ &fi);
if (rc)
goto out;
- fdi = (FILE_DIRECTORY_INFO *)data;
- si = (SEARCH_ID_FULL_DIR_INFO *)data;
+ move_cifs_info_to_smb2(&data->fi, fi);
+ fdi = (FILE_DIRECTORY_INFO *)fi;
+ si = (SEARCH_ID_FULL_DIR_INFO *)fi;
cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb);
fattr.cf_uniqueid = le64_to_cpu(si->UniqueId);
@@ -1123,7 +1163,8 @@ handle_mnt_opt:
out:
cifs_buf_release(smb1_backup_rsp_buf);
cifs_put_tlink(tlink);
- kfree(tmp_data);
+ cifs_free_open_info(&tmp_data);
+ kfree(fattr.cf_symlink_target);
return rc;
}
@@ -1138,7 +1179,7 @@ smb311_posix_get_inode_info(struct inode **inode,
bool adjust_tz = false;
struct cifs_fattr fattr = {0};
bool symlink = false;
- struct smb311_posix_qinfo *data = NULL;
+ struct cifs_open_info_data data = {};
int rc = 0;
int tmprc = 0;
@@ -1155,15 +1196,9 @@ smb311_posix_get_inode_info(struct inode **inode,
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto out;
}
- data = kmalloc(sizeof(struct smb311_posix_qinfo), GFP_KERNEL);
- if (!data) {
- rc = -ENOMEM;
- goto out;
- }
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
- full_path, data,
- &adjust_tz, &symlink);
+ rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz,
+ &symlink);
/*
* 2. Convert it to internal cifs metadata (fattr)
@@ -1171,7 +1206,7 @@ smb311_posix_get_inode_info(struct inode **inode,
switch (rc) {
case 0:
- smb311_posix_info_to_fattr(&fattr, data, sb, adjust_tz, symlink);
+ smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -1228,7 +1263,8 @@ smb311_posix_get_inode_info(struct inode **inode,
}
out:
cifs_put_tlink(tlink);
- kfree(data);
+ cifs_free_open_info(&data);
+ kfree(fattr.cf_symlink_target);
return rc;
}
@@ -2265,13 +2301,13 @@ cifs_dentry_needs_reval(struct dentry *dentry)
return true;
if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) {
- mutex_lock(&cfid->fid_mutex);
+ spin_lock(&cfid->fid_lock);
if (cfid->time && cifs_i->time > cfid->time) {
- mutex_unlock(&cfid->fid_mutex);
+ spin_unlock(&cfid->fid_lock);
close_cached_dir(cfid);
return false;
}
- mutex_unlock(&cfid->fid_mutex);
+ spin_unlock(&cfid->fid_lock);
close_cached_dir(cfid);
}
/*
@@ -2327,7 +2363,7 @@ cifs_invalidate_mapping(struct inode *inode)
static int
cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
@@ -2345,7 +2381,7 @@ cifs_revalidate_mapping(struct inode *inode)
return 0;
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (rc)
return rc;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index b6e6e5d6c8dd..89d5fa887364 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -484,12 +484,35 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(tlink);
if (tcon && tcon->ses->server->ops->notify) {
rc = tcon->ses->server->ops->notify(xid,
- filep, (void __user *)arg);
+ filep, (void __user *)arg,
+ false /* no ret data */);
cifs_dbg(FYI, "ioctl notify rc %d\n", rc);
} else
rc = -EOPNOTSUPP;
cifs_put_tlink(tlink);
break;
+ case CIFS_IOC_NOTIFY_INFO:
+ if (!S_ISDIR(inode->i_mode)) {
+ /* Notify can only be done on directories */
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
+ if (tcon && tcon->ses->server->ops->notify) {
+ rc = tcon->ses->server->ops->notify(xid,
+ filep, (void __user *)arg,
+ true /* return details */);
+ cifs_dbg(FYI, "ioctl notify info rc %d\n", rc);
+ } else
+ rc = -EOPNOTSUPP;
+ cifs_put_tlink(tlink);
+ break;
case CIFS_IOC_SHUTDOWN:
rc = cifs_shutdown(inode->i_sb, arg);
break;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 6803cb27eecc..bd374feeccaa 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -38,29 +38,28 @@ static int
symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
{
int rc;
- struct crypto_shash *md5 = NULL;
- struct sdesc *sdescmd5 = NULL;
+ struct shash_desc *md5 = NULL;
- rc = cifs_alloc_hash("md5", &md5, &sdescmd5);
+ rc = cifs_alloc_hash("md5", &md5);
if (rc)
goto symlink_hash_err;
- rc = crypto_shash_init(&sdescmd5->shash);
+ rc = crypto_shash_init(md5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
goto symlink_hash_err;
}
- rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
+ rc = crypto_shash_update(md5, link_str, link_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
goto symlink_hash_err;
}
- rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
+ rc = crypto_shash_final(md5, md5_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
symlink_hash_err:
- cifs_free_hash(&md5, &sdescmd5);
+ cifs_free_hash(&md5);
return rc;
}
@@ -202,40 +201,6 @@ out:
return rc;
}
-static int
-query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const unsigned char *path,
- char **symlinkinfo)
-{
- int rc;
- u8 *buf = NULL;
- unsigned int link_len = 0;
- unsigned int bytes_read = 0;
-
- buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- if (tcon->ses->server->ops->query_mf_symlink)
- rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
- cifs_sb, path, buf, &bytes_read);
- else
- rc = -ENOSYS;
-
- if (rc)
- goto out;
-
- if (bytes_read == 0) { /* not a symlink */
- rc = -EINVAL;
- goto out;
- }
-
- rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo);
-out:
- kfree(buf);
- return rc;
-}
-
int
check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
@@ -245,6 +210,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
u8 *buf = NULL;
unsigned int link_len = 0;
unsigned int bytes_read = 0;
+ char *symlink = NULL;
if (!couldbe_mf_symlink(fattr))
/* it's not a symlink */
@@ -266,7 +232,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
if (bytes_read == 0) /* not a symlink */
goto out;
- rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL);
+ rc = parse_mf_symlink(buf, bytes_read, &link_len, &symlink);
if (rc == -EINVAL) {
/* it's not a symlink */
rc = 0;
@@ -281,6 +247,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
fattr->cf_mode &= ~S_IFMT;
fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
fattr->cf_dtype = DT_LNK;
+ fattr->cf_symlink_target = symlink;
out:
kfree(buf);
return rc;
@@ -600,75 +567,6 @@ cifs_hl_exit:
return rc;
}
-const char *
-cifs_get_link(struct dentry *direntry, struct inode *inode,
- struct delayed_call *done)
-{
- int rc = -ENOMEM;
- unsigned int xid;
- const char *full_path;
- void *page;
- char *target_path = NULL;
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct tcon_link *tlink = NULL;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
-
- if (!direntry)
- return ERR_PTR(-ECHILD);
-
- xid = get_xid();
-
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink)) {
- free_xid(xid);
- return ERR_CAST(tlink);
- }
- tcon = tlink_tcon(tlink);
- server = tcon->ses->server;
-
- page = alloc_dentry_path();
- full_path = build_path_from_dentry(direntry, page);
- if (IS_ERR(full_path)) {
- free_xid(xid);
- cifs_put_tlink(tlink);
- free_dentry_path(page);
- return ERR_CAST(full_path);
- }
-
- cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
-
- rc = -EACCES;
- /*
- * First try Minshall+French Symlinks, if configured
- * and fallback to UNIX Extensions Symlinks.
- */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
- &target_path);
-
- if (rc != 0 && server->ops->query_symlink) {
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
- bool reparse_point = false;
-
- if (cifsi->cifsAttrs & ATTR_REPARSE)
- reparse_point = true;
-
- rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
- &target_path, reparse_point);
- }
-
- free_dentry_path(page);
- free_xid(xid);
- cifs_put_tlink(tlink);
- if (rc != 0) {
- kfree(target_path);
- return ERR_PTR(rc);
- }
- set_delayed_call(done, kfree_link, target_path);
- return target_path;
-}
-
int
cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
struct dentry *direntry, const char *symname)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 87f60f736731..da51ffd02928 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -117,8 +117,8 @@ tconInfoAlloc(void)
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
- ret_buf->cfid = init_cached_dir();
- if (!ret_buf->cfid) {
+ ret_buf->cfids = init_cached_dirs();
+ if (!ret_buf->cfids) {
kfree(ret_buf);
return NULL;
}
@@ -144,7 +144,7 @@ tconInfoFree(struct cifs_tcon *tcon)
cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
- free_cached_dir(tcon);
+ free_cached_dirs(tcon->cfids);
atomic_dec(&tconInfoAllocCount);
kfree(tcon->nativeFileSystem);
kfree_sensitive(tcon->password);
@@ -525,7 +525,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
cifs_sb->mnt_cifs_serverino_autodisabled = true;
cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n",
- tcon ? tcon->treeName : "new server");
+ tcon ? tcon->tree_name : "new server");
cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n");
cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n");
@@ -824,7 +824,7 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
free_dentry_path(page);
}
-/* parses DFS refferal V3 structure
+/* parses DFS referral V3 structure
* caller is responsible for freeing target_nodes
* returns:
* - on success - 0
@@ -1071,59 +1071,58 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
/**
* cifs_alloc_hash - allocate hash and hash context together
* @name: The name of the crypto hash algo
- * @shash: Where to put the pointer to the hash algo
- * @sdesc: Where to put the pointer to the hash descriptor
+ * @sdesc: SHASH descriptor where to put the pointer to the hash TFM
*
* The caller has to make sure @sdesc is initialized to either NULL or
- * a valid context. Both can be freed via cifs_free_hash().
+ * a valid context. It can be freed via cifs_free_hash().
*/
int
-cifs_alloc_hash(const char *name,
- struct crypto_shash **shash, struct sdesc **sdesc)
+cifs_alloc_hash(const char *name, struct shash_desc **sdesc)
{
int rc = 0;
- size_t size;
+ struct crypto_shash *alg = NULL;
- if (*sdesc != NULL)
+ if (*sdesc)
return 0;
- *shash = crypto_alloc_shash(name, 0, 0);
- if (IS_ERR(*shash)) {
- cifs_dbg(VFS, "Could not allocate crypto %s\n", name);
- rc = PTR_ERR(*shash);
- *shash = NULL;
+ alg = crypto_alloc_shash(name, 0, 0);
+ if (IS_ERR(alg)) {
+ cifs_dbg(VFS, "Could not allocate shash TFM '%s'\n", name);
+ rc = PTR_ERR(alg);
*sdesc = NULL;
return rc;
}
- size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash);
- *sdesc = kmalloc(size, GFP_KERNEL);
+ *sdesc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(alg), GFP_KERNEL);
if (*sdesc == NULL) {
- cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name);
- crypto_free_shash(*shash);
- *shash = NULL;
+ cifs_dbg(VFS, "no memory left to allocate shash TFM '%s'\n", name);
+ crypto_free_shash(alg);
return -ENOMEM;
}
- (*sdesc)->shash.tfm = *shash;
+ (*sdesc)->tfm = alg;
return 0;
}
/**
* cifs_free_hash - free hash and hash context together
- * @shash: Where to find the pointer to the hash algo
- * @sdesc: Where to find the pointer to the hash descriptor
+ * @sdesc: Where to find the pointer to the hash TFM
*
- * Freeing a NULL hash or context is safe.
+ * Freeing a NULL descriptor is safe.
*/
void
-cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc)
+cifs_free_hash(struct shash_desc **sdesc)
{
- kfree(*sdesc);
+ if (unlikely(!sdesc) || !*sdesc)
+ return;
+
+ if ((*sdesc)->tfm) {
+ crypto_free_shash((*sdesc)->tfm);
+ (*sdesc)->tfm = NULL;
+ }
+
+ kfree_sensitive(*sdesc);
*sdesc = NULL;
- if (*shash)
- crypto_free_shash(*shash);
- *shash = NULL;
}
/**
@@ -1328,7 +1327,7 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
char *treename, *dfspath, sep;
int treenamelen, linkpathlen, rc;
- treename = tcon->treeName;
+ treename = tcon->tree_name;
/* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL
* messages MUST be encoded with exactly one leading backslash, not two
* leading backslashes.
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8e060c00c969..2d75ba5aaa8a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -844,17 +844,34 @@ static bool emit_cached_dirents(struct cached_dirents *cde,
struct dir_context *ctx)
{
struct cached_dirent *dirent;
- int rc;
+ bool rc;
list_for_each_entry(dirent, &cde->entries, entry) {
- if (ctx->pos >= dirent->pos)
+ /*
+ * Skip all early entries prior to the current lseek()
+ * position.
+ */
+ if (ctx->pos > dirent->pos)
continue;
+ /*
+ * We recorded the current ->pos value for the dirent
+ * when we stored it in the cache.
+ * However, this sequence of ->pos values may have holes
+ * in it, for example dot-dirs returned from the server
+ * are suppressed.
+ * Handle this bu forcing ctx->pos to be the same as the
+ * ->pos of the current dirent we emit from the cache.
+ * This means that when we emit these entries from the cache
+ * we now emit them with the same ->pos value as in the
+ * initial scan.
+ */
ctx->pos = dirent->pos;
rc = dir_emit(ctx, dirent->name, dirent->namelen,
dirent->fattr.cf_uniqueid,
dirent->fattr.cf_dtype);
if (!rc)
return rc;
+ ctx->pos++;
}
return true;
}
@@ -994,6 +1011,8 @@ static int cifs_filldir(char *find_entry, struct file *file,
cifs_unix_basic_to_fattr(&fattr,
&((FILE_UNIX_INFO *)find_entry)->basic,
cifs_sb);
+ if (S_ISLNK(fattr.cf_mode))
+ fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
break;
case SMB_FIND_FILE_INFO_STANDARD:
cifs_std_info_to_fattr(&fattr,
@@ -1202,10 +1221,10 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
ctx->pos, tmp_buf);
cifs_save_resume_key(current_entry, cifsFile);
break;
- } else
- current_entry =
- nxt_dir_entry(current_entry, end_of_smb,
- cifsFile->srch_inf.info_level);
+ }
+ current_entry =
+ nxt_dir_entry(current_entry, end_of_smb,
+ cifsFile->srch_inf.info_level);
}
kfree(tmp_buf);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 3af3b05b6c74..92e4278ec35d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -496,6 +496,7 @@ out:
cifs_put_tcp_session(chan->server, 0);
}
+ free_xid(xid);
return rc;
}
@@ -601,11 +602,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
/* BB FIXME add check that strings total less
than 335 or will need to send them as arrays */
- /* unicode strings, must be word aligned before the call */
-/* if ((long) bcc_ptr % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- } */
/* copy user */
if (ses->user_name == NULL) {
/* null user mount */
@@ -1213,10 +1209,18 @@ out_free_smb_buf:
static void
sess_free_buffer(struct sess_data *sess_data)
{
+ struct kvec *iov = sess_data->iov;
+
+ /*
+ * Zero the session data before freeing, as it might contain sensitive info (keys, etc).
+ * Note that iov[1] is already freed by caller.
+ */
+ if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base)
+ memzero_explicit(iov[0].iov_base, iov[0].iov_len);
- free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+ free_rsp_buf(sess_data->buf0_type, iov[0].iov_base);
sess_data->buf0_type = CIFS_NO_BUFFER;
- kfree(sess_data->iov[2].iov_base);
+ kfree_sensitive(iov[2].iov_base);
}
static int
@@ -1318,7 +1322,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
}
if (ses->capabilities & CAP_UNICODE) {
- if (sess_data->iov[0].iov_len % 2) {
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) {
*bcc_ptr = 0;
bcc_ptr++;
}
@@ -1358,7 +1362,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
++bcc_ptr;
--bytes_remaining;
}
@@ -1374,7 +1378,7 @@ out:
sess_data->result = rc;
sess_data->func = NULL;
sess_free_buffer(sess_data);
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
}
@@ -1442,8 +1446,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
if (ses->capabilities & CAP_UNICODE) {
/* unicode strings must be word aligned */
- if ((sess_data->iov[0].iov_len
- + sess_data->iov[1].iov_len) % 2) {
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
*bcc_ptr = 0;
bcc_ptr++;
}
@@ -1494,7 +1497,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
++bcc_ptr;
--bytes_remaining;
}
@@ -1513,7 +1516,7 @@ out:
sess_data->result = rc;
sess_data->func = NULL;
sess_free_buffer(sess_data);
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
}
@@ -1546,7 +1549,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
bcc_ptr = sess_data->iov[2].iov_base;
/* unicode strings must be word aligned */
- if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
*bcc_ptr = 0;
bcc_ptr++;
}
@@ -1648,7 +1651,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
out_free_ntlmsspblob:
- kfree(ntlmsspblob);
+ kfree_sensitive(ntlmsspblob);
out:
sess_free_buffer(sess_data);
@@ -1658,9 +1661,9 @@ out:
}
/* Else error. Cleanup */
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->func = NULL;
@@ -1747,7 +1750,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
++bcc_ptr;
--bytes_remaining;
}
@@ -1759,7 +1762,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
}
out_free_ntlmsspblob:
- kfree(ntlmsspblob);
+ kfree_sensitive(ntlmsspblob);
out:
sess_free_buffer(sess_data);
@@ -1767,9 +1770,9 @@ out:
rc = sess_establish_session(sess_data);
/* Cleanup */
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->func = NULL;
@@ -1845,7 +1848,7 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
rc = sess_data->result;
out:
- kfree(sess_data);
+ kfree_sensitive(sess_data);
return rc;
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index f36b2d2d40ca..50480751e521 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -542,31 +542,32 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int
-cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink)
+static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink)
{
int rc;
+ FILE_ALL_INFO fi = {};
*symlink = false;
/* could do find first instead but this returns more info */
- rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
+ rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
/*
* BB optimize code so we do not make the above call when server claims
* no NT SMB support and the above call failed at least once - set flag
* in tcon or mount.
*/
if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
- rc = SMBQueryInformation(xid, tcon, full_path, data,
- cifs_sb->local_nls,
+ rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls,
cifs_remap(cifs_sb));
+ if (!rc)
+ move_cifs_info_to_smb2(&data->fi, &fi);
*adjustTZ = true;
}
- if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
+ if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) {
int tmprc;
int oplock = 0;
struct cifs_fid fid;
@@ -592,10 +593,9 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int
-cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- u64 *uniqueid, FILE_ALL_INFO *data)
+static int cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ u64 *uniqueid, struct cifs_open_info_data *unused)
{
/*
* We can not use the IndexNumber field by default from Windows or
@@ -613,11 +613,22 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
cifs_remap(cifs_sb));
}
-static int
-cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_fid *fid, FILE_ALL_INFO *data)
+static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, struct cifs_open_info_data *data)
{
- return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
+ int rc;
+ FILE_ALL_INFO fi = {};
+
+ if (cfile->symlink_target) {
+ data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!data->symlink_target)
+ return -ENOMEM;
+ }
+
+ rc = CIFSSMBQFileInfo(xid, tcon, cfile->fid.netfid, &fi);
+ if (!rc)
+ move_cifs_info_to_smb2(&data->fi, &fi);
+ return rc;
}
static void
@@ -702,19 +713,20 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
cifsInode->cifsAttrs = dosattrs;
}
-static int
-cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
- __u32 *oplock, FILE_ALL_INFO *buf)
+static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
+ void *buf)
{
+ FILE_ALL_INFO *fi = buf;
+
if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
oparms->disposition,
oparms->desired_access,
oparms->create_options,
- &oparms->fid->netfid, oplock, buf,
+ &oparms->fid->netfid, oplock, fi,
oparms->cifs_sb->local_nls,
cifs_remap(oparms->cifs_sb));
- return CIFS_open(xid, oparms, oplock, buf);
+ return CIFS_open(xid, oparms, oplock, fi);
}
static void
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 9dfd2dd612c2..ffbd9a99fc12 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -20,40 +20,125 @@
#include "cifs_unicode.h"
#include "fscache.h"
#include "smb2proto.h"
+#include "smb2status.h"
-int
-smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
- __u32 *oplock, FILE_ALL_INFO *buf)
+static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
+{
+ struct smb2_err_rsp *err = iov->iov_base;
+ struct smb2_symlink_err_rsp *sym = ERR_PTR(-EINVAL);
+ u32 len;
+
+ if (err->ErrorContextCount) {
+ struct smb2_error_context_rsp *p, *end;
+
+ len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp,
+ ErrorContextData) +
+ sizeof(struct smb2_symlink_err_rsp));
+ if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err))
+ return ERR_PTR(-EINVAL);
+
+ p = (struct smb2_error_context_rsp *)err->ErrorData;
+ end = (struct smb2_error_context_rsp *)((u8 *)err + iov->iov_len);
+ do {
+ if (le32_to_cpu(p->ErrorId) == SMB2_ERROR_ID_DEFAULT) {
+ sym = (struct smb2_symlink_err_rsp *)&p->ErrorContextData;
+ break;
+ }
+ cifs_dbg(FYI, "%s: skipping unhandled error context: 0x%x\n",
+ __func__, le32_to_cpu(p->ErrorId));
+
+ len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8);
+ p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len);
+ } while (p < end);
+ } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) &&
+ iov->iov_len >= SMB2_SYMLINK_STRUCT_SIZE) {
+ sym = (struct smb2_symlink_err_rsp *)err->ErrorData;
+ }
+
+ if (!IS_ERR(sym) && (le32_to_cpu(sym->SymLinkErrorTag) != SYMLINK_ERROR_TAG ||
+ le32_to_cpu(sym->ReparseTag) != IO_REPARSE_TAG_SYMLINK))
+ sym = ERR_PTR(-EINVAL);
+
+ return sym;
+}
+
+int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path)
+{
+ struct smb2_symlink_err_rsp *sym;
+ unsigned int sub_offs, sub_len;
+ unsigned int print_offs, print_len;
+ char *s;
+
+ if (!cifs_sb || !iov || !iov->iov_base || !iov->iov_len || !path)
+ return -EINVAL;
+
+ sym = symlink_data(iov);
+ if (IS_ERR(sym))
+ return PTR_ERR(sym);
+
+ sub_len = le16_to_cpu(sym->SubstituteNameLength);
+ sub_offs = le16_to_cpu(sym->SubstituteNameOffset);
+ print_len = le16_to_cpu(sym->PrintNameLength);
+ print_offs = le16_to_cpu(sym->PrintNameOffset);
+
+ if (iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offs + sub_len ||
+ iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offs + print_len)
+ return -EINVAL;
+
+ s = cifs_strndup_from_utf16((char *)sym->PathBuffer + sub_offs, sub_len, true,
+ cifs_sb->local_nls);
+ if (!s)
+ return -ENOMEM;
+ convert_delimiter(s, '/');
+ cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, s);
+
+ *path = s;
+ return 0;
+}
+
+int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf)
{
int rc;
__le16 *smb2_path;
- struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock;
+ struct cifs_open_info_data *data = buf;
+ struct smb2_file_all_info file_info = {};
+ struct smb2_file_all_info *smb2_data = data ? &file_info : NULL;
+ struct kvec err_iov = {};
+ int err_buftype = CIFS_NO_BUFFER;
struct cifs_fid *fid = oparms->fid;
struct network_resiliency_req nr_ioctl_req;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
- if (smb2_path == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- GFP_KERNEL);
- if (smb2_data == NULL) {
- rc = -ENOMEM;
- goto out;
- }
+ if (smb2_path == NULL)
+ return -ENOMEM;
oparms->desired_access |= FILE_READ_ATTRIBUTES;
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
- rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL,
- NULL, NULL);
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+ &err_buftype);
+ if (rc && data) {
+ struct smb2_hdr *hdr = err_iov.iov_base;
+
+ if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER))
+ rc = -ENOMEM;
+ else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov,
+ &data->symlink_target);
+ if (!rc) {
+ memset(smb2_data, 0, sizeof(*smb2_data));
+ oparms->create_options |= OPEN_REPARSE_POINT;
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data,
+ NULL, NULL, NULL);
+ oparms->create_options &= ~OPEN_REPARSE_POINT;
+ }
+ }
+ }
+
if (rc)
goto out;
-
if (oparms->tcon->use_resilient) {
/* default timeout is 0, servers pick default (120 seconds) */
nr_ioctl_req.Timeout =
@@ -73,7 +158,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
rc = 0;
}
- if (buf) {
+ if (smb2_data) {
/* if open response does not have IndexNumber field - get it */
if (smb2_data->IndexNumber == 0) {
rc = SMB2_get_srv_num(xid, oparms->tcon,
@@ -89,12 +174,12 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
rc = 0;
}
}
- move_smb2_info_to_cifs(buf, smb2_data);
+ memcpy(&data->fi, smb2_data, sizeof(data->fi));
}
*oplock = smb2_oplock;
out:
- kfree(smb2_data);
+ free_rsp_buf(err_buftype, err_iov.iov_base);
kfree(smb2_path);
return rc;
}
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index b83f59051b26..68e08c85fbb8 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -24,6 +24,7 @@
#include "smb2pdu.h"
#include "smb2proto.h"
#include "cached_dir.h"
+#include "smb2status.h"
static void
free_set_inf_compound(struct smb_rqst *rqst)
@@ -50,13 +51,15 @@ struct cop_vars {
/*
* note: If cfile is passed, the reference to it is dropped here.
* So make sure that you do not reuse cfile after return from this func.
+ *
+ * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all
+ * error responses. Caller is also responsible for freeing them up.
*/
-static int
-smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 desired_access, __u32 create_disposition,
- __u32 create_options, umode_t mode, void *ptr, int command,
- struct cifsFileInfo *cfile)
+static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ __u32 desired_access, __u32 create_disposition, __u32 create_options,
+ umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile,
+ struct kvec *err_iov, int *err_buftype)
{
struct cop_vars *vars = NULL;
struct kvec *rsp_iov;
@@ -70,6 +73,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
int num_rqst = 0;
int resp_buftype[3];
struct smb2_query_info_rsp *qi_rsp = NULL;
+ struct cifs_open_info_data *idata;
int flags = 0;
__u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
unsigned int size[2];
@@ -379,20 +383,25 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_open_free(&rqst[0]);
if (rc == -EREMCHG) {
- pr_warn_once("server share %s deleted\n", tcon->treeName);
+ pr_warn_once("server share %s deleted\n", tcon->tree_name);
tcon->need_reconnect = true;
}
switch (command) {
case SMB2_OP_QUERY_INFO:
+ idata = ptr;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
+ rc = -ENOMEM;
+ }
if (rc == 0) {
qi_rsp = (struct smb2_query_info_rsp *)
rsp_iov[1].iov_base;
rc = smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(struct smb2_file_all_info),
- ptr);
+ &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi);
}
if (rqst[1].rq_iov)
SMB2_query_info_free(&rqst[1]);
@@ -406,13 +415,20 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
tcon->tid);
break;
case SMB2_OP_POSIX_QUERY_INFO:
+ idata = ptr;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
+ rc = -ENOMEM;
+ }
if (rc == 0) {
qi_rsp = (struct smb2_query_info_rsp *)
rsp_iov[1].iov_base;
rc = smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(struct smb311_posix_qinfo) /* add SIDs */, ptr);
+ &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */,
+ (char *)&idata->posix_fi);
}
if (rqst[1].rq_iov)
SMB2_query_info_free(&rqst[1]);
@@ -477,42 +493,33 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
free_set_inf_compound(rqst);
break;
}
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+
+ if (rc && err_iov && err_buftype) {
+ memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov));
+ memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype));
+ } else {
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ }
kfree(vars);
return rc;
}
-void
-move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
-{
- memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src);
- dst->CurrentByteOffset = src->CurrentByteOffset;
- dst->Mode = src->Mode;
- dst->AlignmentRequirement = src->AlignmentRequirement;
- dst->IndexNumber1 = 0; /* we don't use it */
-}
-
-int
-smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse)
+int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse)
{
int rc;
- struct smb2_file_all_info *smb2_data;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
+ struct kvec err_iov[3] = {};
+ int err_buftype[3] = {};
*adjust_tz = false;
*reparse = false;
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- GFP_KERNEL);
- if (smb2_data == NULL)
- return -ENOMEM;
-
if (strcmp(full_path, ""))
rc = -ENOENT;
else
@@ -520,63 +527,58 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
/* If it is a root and its handle is cached then use it */
if (!rc) {
if (cfid->file_all_info_is_valid) {
- move_smb2_info_to_cifs(data,
- &cfid->file_all_info);
+ memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi));
} else {
- rc = SMB2_query_info(xid, tcon,
- cfid->fid.persistent_fid,
- cfid->fid.volatile_fid, smb2_data);
- if (!rc)
- move_smb2_info_to_cifs(data, smb2_data);
+ rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid, &data->fi);
}
close_cached_dir(cfid);
- goto out;
+ return rc;
}
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
- ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile,
+ err_iov, err_buftype);
if (rc == -EOPNOTSUPP) {
+ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target);
+ if (rc)
+ goto out;
+ }
*reparse = true;
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE,
- smb2_data, SMB2_OP_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
+ FILE_OPEN, create_options, ACL_NO_MODE, data,
+ SMB2_OP_QUERY_INFO, cfile, NULL, NULL);
}
- if (rc)
- goto out;
- move_smb2_info_to_cifs(data, smb2_data);
out:
- kfree(smb2_data);
+ free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
+ free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
+ free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
return rc;
}
-int
-smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse)
+int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse)
{
int rc;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
- struct smb311_posix_qinfo *smb2_data;
+ struct kvec err_iov[3] = {};
+ int err_buftype[3] = {};
*adjust_tz = false;
*reparse = false;
- /* BB TODO: Make struct larger when add support for parsing owner SIDs */
- smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo),
- GFP_KERNEL);
- if (smb2_data == NULL)
- return -ENOMEM;
-
/*
* BB TODO: Add support for using the cached root handle.
* Create SMB2_query_posix_info worker function to do non-compounded query
@@ -585,29 +587,32 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
*/
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
- ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile,
+ err_iov, err_buftype);
if (rc == -EOPNOTSUPP) {
/* BB TODO: When support for special files added to Samba re-verify this path */
+ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target);
+ if (rc)
+ goto out;
+ }
*reparse = true;
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE,
- smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
+ FILE_OPEN, create_options, ACL_NO_MODE, data,
+ SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL);
}
- if (rc)
- goto out;
-
- /* TODO: will need to allow for the 2 SIDs when add support for getting owner UID/GID */
- memcpy(data, smb2_data, sizeof(struct smb311_posix_qinfo));
out:
- kfree(smb2_data);
+ free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
+ free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
+ free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
return rc;
}
@@ -619,7 +624,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
- NULL);
+ NULL, NULL, NULL);
}
void
@@ -641,7 +646,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE,
- &data, SMB2_OP_SET_INFO, cfile);
+ &data, SMB2_OP_SET_INFO, cfile, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -650,9 +655,10 @@ int
smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
+ drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_NOT_FILE, ACL_NO_MODE,
- NULL, SMB2_OP_RMDIR, NULL);
+ NULL, SMB2_OP_RMDIR, NULL, NULL, NULL);
}
int
@@ -661,7 +667,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL);
+ ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL);
}
static int
@@ -680,7 +686,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
}
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
- command, cfile);
+ command, cfile, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -693,6 +699,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
{
struct cifsFileInfo *cfile;
+ drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb);
cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
return smb2_set_path_attr(xid, tcon, from_name, to_name,
@@ -720,7 +727,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
- &eof, SMB2_OP_SET_EOF, cfile);
+ &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL);
}
int
@@ -746,7 +753,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
- 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile);
+ 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile,
+ NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index d73e5672aac4..a38720477966 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
- if (((calc_len + 7) & ~7) == len)
+ if (ALIGN(calc_len, 8) == len)
return 0;
/*
@@ -870,8 +870,8 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct kvec *iov, int nvec)
{
int i, rc;
- struct sdesc *d;
struct smb2_hdr *hdr;
+ struct shash_desc *sha512 = NULL;
hdr = (struct smb2_hdr *)iov[0].iov_base;
/* neg prot are always taken */
@@ -901,14 +901,14 @@ ok:
if (rc)
return rc;
- d = server->secmech.sdescsha512;
- rc = crypto_shash_init(&d->shash);
+ sha512 = server->secmech.sha512;
+ rc = crypto_shash_init(sha512);
if (rc) {
cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__);
return rc;
}
- rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash,
+ rc = crypto_shash_update(sha512, ses->preauth_sha_hash,
SMB2_PREAUTH_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__);
@@ -916,8 +916,7 @@ ok:
}
for (i = 0; i < nvec; i++) {
- rc = crypto_shash_update(&d->shash,
- iov[i].iov_base, iov[i].iov_len);
+ rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update sha512 shash\n",
__func__);
@@ -925,7 +924,7 @@ ok:
}
}
- rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash);
+ rc = crypto_shash_final(sha512, ses->preauth_sha_hash);
if (rc) {
cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n",
__func__);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 421be43af425..4f53fa012936 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -512,8 +512,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
- size_t buf_len,
- struct cifs_ses *ses)
+ size_t buf_len, struct cifs_ses *ses, bool in_mount)
{
struct network_interface_info_ioctl_rsp *p;
struct sockaddr_in *addr4;
@@ -531,6 +530,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
p = buf;
spin_lock(&ses->iface_lock);
+ ses->iface_count = 0;
/*
* Go through iface_list and do kref_put to remove
* any unused ifaces. ifaces in use will be removed
@@ -543,6 +543,21 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
}
spin_unlock(&ses->iface_lock);
+ /*
+ * Samba server e.g. can return an empty interface list in some cases,
+ * which would only be a problem if we were requesting multichannel
+ */
+ if (bytes_left == 0) {
+ /* avoid spamming logs every 10 minutes, so log only in mount */
+ if ((ses->chan_max > 1) && in_mount)
+ cifs_dbg(VFS,
+ "multichannel not available\n"
+ "Empty network interface list returned by server %s\n",
+ ses->server->hostname);
+ rc = -EINVAL;
+ goto out;
+ }
+
while (bytes_left >= sizeof(*p)) {
memset(&tmp_iface, 0, sizeof(tmp_iface));
tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
@@ -637,9 +652,9 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
kref_put(&iface->refcount, release_iface);
} else
list_add_tail(&info->iface_head, &ses->iface_list);
- spin_unlock(&ses->iface_lock);
ses->iface_count++;
+ spin_unlock(&ses->iface_lock);
ses->iface_last_update = jiffies;
next_iface:
nb_iface++;
@@ -673,7 +688,7 @@ out:
}
int
-SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount)
{
int rc;
unsigned int ret_data_len = 0;
@@ -693,7 +708,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
goto out;
}
- rc = parse_server_interfaces(out_buf, ret_data_len, ses);
+ rc = parse_server_interfaces(out_buf, ret_data_len, ses, in_mount);
if (rc)
goto out;
@@ -729,7 +744,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return;
- SMB3_request_interfaces(xid, tcon);
+ SMB3_request_interfaces(xid, tcon, true /* called during mount */);
SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
FS_ATTRIBUTE_INFORMATION);
@@ -787,7 +802,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid);
if (!rc) {
- if (cfid->is_valid) {
+ if (cfid->has_lease) {
close_cached_dir(cfid);
return 0;
}
@@ -817,33 +832,25 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int
-smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- u64 *uniqueid, FILE_ALL_INFO *data)
+static int smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ u64 *uniqueid, struct cifs_open_info_data *data)
{
- *uniqueid = le64_to_cpu(data->IndexNumber);
+ *uniqueid = le64_to_cpu(data->fi.IndexNumber);
return 0;
}
-static int
-smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_fid *fid, FILE_ALL_INFO *data)
+static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, struct cifs_open_info_data *data)
{
- int rc;
- struct smb2_file_all_info *smb2_data;
-
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- GFP_KERNEL);
- if (smb2_data == NULL)
- return -ENOMEM;
+ struct cifs_fid *fid = &cfile->fid;
- rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid,
- smb2_data);
- if (!rc)
- move_smb2_info_to_cifs(data, smb2_data);
- kfree(smb2_data);
- return rc;
+ if (cfile->symlink_target) {
+ data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!data->symlink_target)
+ return -ENOMEM;
+ }
+ return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi);
}
#ifdef CONFIG_CIFS_XATTR
@@ -1327,7 +1334,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
CIFSMaxBufSize, (char **)&res_key, &ret_data_len);
if (rc == -EOPNOTSUPP) {
- pr_warn_once("Server share %s does not support copy range\n", tcon->treeName);
+ pr_warn_once("Server share %s does not support copy range\n", tcon->tree_name);
goto req_res_key_exit;
} else if (rc) {
cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
@@ -2012,9 +2019,10 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
static int
smb3_notify(const unsigned int xid, struct file *pfile,
- void __user *ioc_buf)
+ void __user *ioc_buf, bool return_changes)
{
- struct smb3_notify notify;
+ struct smb3_notify_info notify;
+ struct smb3_notify_info __user *pnotify_buf;
struct dentry *dentry = pfile->f_path.dentry;
struct inode *inode = file_inode(pfile);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -2022,10 +2030,12 @@ smb3_notify(const unsigned int xid, struct file *pfile,
struct cifs_fid fid;
struct cifs_tcon *tcon;
const unsigned char *path;
+ char *returned_ioctl_info = NULL;
void *page = alloc_dentry_path();
__le16 *utf16_path = NULL;
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
int rc = 0;
+ __u32 ret_len = 0;
path = build_path_from_dentry(dentry, page);
if (IS_ERR(path)) {
@@ -2039,9 +2049,17 @@ smb3_notify(const unsigned int xid, struct file *pfile,
goto notify_exit;
}
- if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) {
- rc = -EFAULT;
- goto notify_exit;
+ if (return_changes) {
+ if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify_info))) {
+ rc = -EFAULT;
+ goto notify_exit;
+ }
+ } else {
+ if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) {
+ rc = -EFAULT;
+ goto notify_exit;
+ }
+ notify.data_len = 0;
}
tcon = cifs_sb_master_tcon(cifs_sb);
@@ -2058,12 +2076,22 @@ smb3_notify(const unsigned int xid, struct file *pfile,
goto notify_exit;
rc = SMB2_change_notify(xid, tcon, fid.persistent_fid, fid.volatile_fid,
- notify.watch_tree, notify.completion_filter);
+ notify.watch_tree, notify.completion_filter,
+ notify.data_len, &returned_ioctl_info, &ret_len);
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc);
-
+ if (return_changes && (ret_len > 0) && (notify.data_len > 0)) {
+ if (ret_len > notify.data_len)
+ ret_len = notify.data_len;
+ pnotify_buf = (struct smb3_notify_info __user *)ioc_buf;
+ if (copy_to_user(pnotify_buf->notify_data, returned_ioctl_info, ret_len))
+ rc = -EFAULT;
+ else if (copy_to_user(&pnotify_buf->data_len, &ret_len, sizeof(ret_len)))
+ rc = -EFAULT;
+ }
+ kfree(returned_ioctl_info);
notify_exit:
free_dentry_path(page);
kfree(utf16_path);
@@ -2289,7 +2317,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
- tcon->treeName);
+ tcon->tree_name);
return;
}
}
@@ -2498,7 +2526,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
if (rc == -EREMCHG) {
tcon->need_reconnect = true;
pr_warn_once("server share %s deleted\n",
- tcon->treeName);
+ tcon->tree_name);
}
goto qic_exit;
}
@@ -2814,9 +2842,6 @@ parse_reparse_point(struct reparse_data_buffer *buf,
}
}
-#define SMB2_SYMLINK_STRUCT_SIZE \
- (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
-
static int
smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
@@ -2828,13 +2853,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
struct kvec err_iov = {NULL, 0};
- struct smb2_err_rsp *err_buf = NULL;
- struct smb2_symlink_err_rsp *symlink;
struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- unsigned int sub_len;
- unsigned int sub_offset;
- unsigned int print_len;
- unsigned int print_offset;
int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst rqst[3];
int resp_buftype[3];
@@ -2951,47 +2970,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
goto querty_exit;
}
- err_buf = err_iov.iov_base;
- if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
- err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
- if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG ||
- le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- /* open must fail on symlink - reset rc */
- rc = 0;
- sub_len = le16_to_cpu(symlink->SubstituteNameLength);
- sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
- print_len = le16_to_cpu(symlink->PrintNameLength);
- print_offset = le16_to_cpu(symlink->PrintNameOffset);
-
- if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- if (err_iov.iov_len <
- SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- *target_path = cifs_strndup_from_utf16(
- (char *)symlink->PathBuffer + sub_offset,
- sub_len, true, cifs_sb->local_nls);
- if (!(*target_path)) {
- rc = -ENOMEM;
- goto querty_exit;
- }
- convert_delimiter(*target_path, '/');
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path);
querty_exit:
cifs_dbg(FYI, "query symlink rc %d\n", rc);
@@ -4344,8 +4323,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- tfm = enc ? server->secmech.ccmaesencrypt :
- server->secmech.ccmaesdecrypt;
+ tfm = enc ? server->secmech.enc : server->secmech.dec;
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
@@ -4410,11 +4388,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kfree(iv);
+ kfree_sensitive(iv);
free_sg:
- kfree(sg);
+ kfree_sensitive(sg);
free_req:
- kfree(req);
+ kfree_sensitive(req);
return rc;
}
@@ -5102,7 +5080,7 @@ smb2_make_node(unsigned int xid, struct inode *inode,
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
int rc = -EPERM;
- FILE_ALL_INFO *buf = NULL;
+ struct cifs_open_info_data buf = {};
struct cifs_io_parms io_parms = {0};
__u32 oplock = 0;
struct cifs_fid fid;
@@ -5118,7 +5096,7 @@ smb2_make_node(unsigned int xid, struct inode *inode,
* and was used by default in earlier versions of Windows
*/
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto out;
+ return rc;
/*
* TODO: Add ability to create instead via reparse point. Windows (e.g.
@@ -5127,16 +5105,10 @@ smb2_make_node(unsigned int xid, struct inode *inode,
*/
if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto out;
+ return rc;
cifs_dbg(FYI, "sfu compat create special file\n");
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
@@ -5151,21 +5123,21 @@ smb2_make_node(unsigned int xid, struct inode *inode,
oplock = REQ_OPLOCK;
else
oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
if (rc)
- goto out;
+ return rc;
/*
* BB Do not bother to decode buf since no local inode yet to put
* timestamps in, but we can reuse it safely.
*/
- pdev = (struct win_dev *)buf;
+ pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
io_parms.offset = 0;
io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
+ iov[1].iov_base = &buf.fi;
iov[1].iov_len = sizeof(struct win_dev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
@@ -5184,8 +5156,8 @@ smb2_make_node(unsigned int xid, struct inode *inode,
d_drop(dentry);
/* FIXME: add code here to set EAs */
-out:
- kfree(buf);
+
+ cifs_free_open_info(&buf);
return rc;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 6352ab32c7e7..a5695748a89b 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
/*
* Context Data length must be rounded to multiple of 8 for some servers
*/
- pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP(
- sizeof(struct smb2_signing_capabilities) -
- sizeof(struct smb2_neg_context) +
- (num_algs * 2 /* sizeof u16 */), 8) * 8);
+ pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) -
+ sizeof(struct smb2_neg_context) +
+ (num_algs * sizeof(u16)), 8));
pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs);
pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC);
- ctxt_len += 2 /* sizeof le16 */ * num_algs;
- ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8;
+ ctxt_len += sizeof(__le16) * num_algs;
+ ctxt_len = ALIGN(ctxt_len, 8);
return ctxt_len;
/* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */
}
@@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
/* copy up to max of first 100 bytes of server name to NetName field */
pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp));
/* context size is DataLength + minimal smb2_neg_context */
- return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
- sizeof(struct smb2_neg_context), 8) * 8;
+ return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8);
}
static void
@@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
* round up total_len of fixed part of SMB3 negotiate request to 8
* byte boundary before adding negotiate contexts
*/
- *total_len = roundup(*total_len, 8);
+ *total_len = ALIGN(*total_len, 8);
pneg_ctxt = (*total_len) + (char *)req;
req->NegotiateContextOffset = cpu_to_le32(*total_len);
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
- ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
+ ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8);
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
- ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
+ ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8);
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
@@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
if (server->compress_algorithm) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
pneg_ctxt);
- ctxt_len = DIV_ROUND_UP(
- sizeof(struct smb2_compression_capabilities_context),
- 8) * 8;
+ ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8);
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
neg_context_count++;
@@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
if (rc)
break;
/* offsets must be 8 byte aligned */
- clen = (clen + 7) & ~0x7;
+ clen = ALIGN(clen, 8);
offset += clen + sizeof(struct smb2_neg_context);
len_of_ctxts -= clen;
}
@@ -873,7 +869,7 @@ SMB2_negotiate(const unsigned int xid,
struct smb2_negotiate_rsp *rsp;
struct kvec iov[1];
struct kvec rsp_iov;
- int rc = 0;
+ int rc;
int resp_buftype;
int blob_offset, blob_length;
char *security_blob;
@@ -1169,9 +1165,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
pneg_inbuf->Dialects[0] =
cpu_to_le16(server->vals->protocol_id);
pneg_inbuf->DialectCount = cpu_to_le16(1);
- /* structure is big enough for 3 dialects, sending only 1 */
+ /* structure is big enough for 4 dialects, sending only 1 */
inbuflen = sizeof(*pneg_inbuf) -
- sizeof(pneg_inbuf->Dialects[0]) * 2;
+ sizeof(pneg_inbuf->Dialects[0]) * 3;
}
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
@@ -1345,7 +1341,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
static void
SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data)
{
- free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+ struct kvec *iov = sess_data->iov;
+
+ /* iov[1] is already freed by caller */
+ if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base)
+ memzero_explicit(iov[0].iov_base, iov[0].iov_len);
+
+ free_rsp_buf(sess_data->buf0_type, iov[0].iov_base);
sess_data->buf0_type = CIFS_NO_BUFFER;
}
@@ -1477,6 +1479,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
out_put_spnego_key:
key_invalidate(spnego_key);
key_put(spnego_key);
+ if (rc)
+ kfree_sensitive(ses->auth_key.response);
out:
sess_data->result = rc;
sess_data->func = NULL;
@@ -1526,7 +1530,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
&blob_length, ses, server,
sess_data->nls_cp);
if (rc)
- goto out_err;
+ goto out;
if (use_spnego) {
/* BB eventually need to add this */
@@ -1573,7 +1577,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
}
out:
- kfree(ntlmssp_blob);
+ kfree_sensitive(ntlmssp_blob);
SMB2_sess_free_buffer(sess_data);
if (!rc) {
sess_data->result = 0;
@@ -1581,7 +1585,7 @@ out:
return;
}
out_err:
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->result = rc;
sess_data->func = NULL;
@@ -1657,9 +1661,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
}
#endif
out:
- kfree(ntlmssp_blob);
+ kfree_sensitive(ntlmssp_blob);
SMB2_sess_free_buffer(sess_data);
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->result = rc;
sess_data->func = NULL;
@@ -1737,7 +1741,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
cifs_server_dbg(VFS, "signing requested but authenticated as guest\n");
rc = sess_data->result;
out:
- kfree(sess_data);
+ kfree_sensitive(sess_data);
return rc;
}
@@ -1930,7 +1934,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
- strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -1973,6 +1977,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if (!ses || !(ses->server))
return -EIO;
+ trace_smb3_tdis_enter(xid, tcon->tid, ses->Suid, tcon->tree_name);
spin_lock(&ses->chan_lock);
if ((tcon->need_reconnect) ||
(CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) {
@@ -2004,8 +2009,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
rc = cifs_send_recv(xid, ses, ses->server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
- if (rc)
+ if (rc) {
cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE);
+ trace_smb3_tdis_err(xid, tcon->tid, ses->Suid, rc);
+ }
+ trace_smb3_tdis_done(xid, tcon->tid, ses->Suid);
return rc;
}
@@ -2411,9 +2419,9 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
unsigned int acelen, acl_size, ace_count;
unsigned int owner_offset = 0;
unsigned int group_offset = 0;
- struct smb3_acl acl;
+ struct smb3_acl acl = {};
- *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
+ *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
if (set_owner) {
/* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */
@@ -2484,10 +2492,11 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */
acl.AclSize = cpu_to_le16(acl_size);
acl.AceCount = cpu_to_le16(ace_count);
+ /* acl.Sbz1 and Sbz2 MBZ so are not set here, but initialized above */
memcpy(aclptr, &acl, sizeof(struct smb3_acl));
buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
- *len = roundup(ptr - (__u8 *)buf, 8);
+ *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8);
return buf;
}
@@ -2581,7 +2590,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
* final path needs to be 8-byte aligned as specified in
* MS-SMB2 2.2.13 SMB2 CREATE Request.
*/
- *out_size = roundup(*out_len * sizeof(__le16), 8);
+ *out_size = round_up(*out_len * sizeof(__le16), 8);
*out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL);
if (!*out_path)
return -ENOMEM;
@@ -2674,7 +2683,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
- tcon->treeName, utf16_path);
+ tcon->tree_name, utf16_path);
if (rc)
goto err_free_req;
@@ -2816,7 +2825,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
- tcon->treeName, path);
+ tcon->tree_name, path);
if (rc)
return rc;
req->NameLength = cpu_to_le16(name_len * 2);
@@ -2826,9 +2835,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
/* MUST set path len (NameLength) to 0 opening root of share */
req->NameLength = cpu_to_le16(uni_path_len - 2);
- copy_size = uni_path_len;
- if (copy_size % 8 != 0)
- copy_size = roundup(copy_size, 8);
+ copy_size = round_up(uni_path_len, 8);
copy_path = kzalloc(copy_size, GFP_KERNEL);
if (!copy_path)
return -ENOMEM;
@@ -3011,7 +3018,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
oparms->create_options, oparms->desired_access, rc);
if (rc == -EREMCHG) {
pr_warn_once("server share %s deleted\n",
- tcon->treeName);
+ tcon->tree_name);
tcon->need_reconnect = true;
}
goto creat_exit;
@@ -3472,7 +3479,7 @@ smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
if (rc)
return rc;
- memcpy(data, begin_of_buf, buffer_length);
+ memcpy(data, begin_of_buf, minbufsize);
return 0;
}
@@ -3596,7 +3603,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
- &rsp_iov, min_len, *data);
+ &rsp_iov, dlen ? *dlen : min_len, *data);
if (rc && allocated) {
kfree(*data);
*data = NULL;
@@ -3702,11 +3709,13 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
int
SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, bool watch_tree,
- u32 completion_filter)
+ u32 completion_filter, u32 max_out_data_len, char **out_data,
+ u32 *plen /* returned data len */)
{
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = cifs_pick_channel(ses);
struct smb_rqst rqst;
+ struct smb2_change_notify_rsp *smb_rsp;
struct kvec iov[1];
struct kvec rsp_iov = {NULL, 0};
int resp_buftype = CIFS_NO_BUFFER;
@@ -3722,6 +3731,9 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
memset(&rqst, 0, sizeof(struct smb_rqst));
memset(&iov, 0, sizeof(iov));
+ if (plen)
+ *plen = 0;
+
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
@@ -3740,9 +3752,28 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE);
trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid,
(u8)watch_tree, completion_filter, rc);
- } else
+ } else {
trace_smb3_notify_done(xid, persistent_fid, tcon->tid,
- ses->Suid, (u8)watch_tree, completion_filter);
+ ses->Suid, (u8)watch_tree, completion_filter);
+ /* validate that notify information is plausible */
+ if ((rsp_iov.iov_base == NULL) ||
+ (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp)))
+ goto cnotify_exit;
+
+ smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base;
+
+ smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset),
+ le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov,
+ sizeof(struct file_notify_information));
+
+ *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset),
+ le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL);
+ if (*out_data == NULL) {
+ rc = -ENOMEM;
+ goto cnotify_exit;
+ } else
+ *plen = le32_to_cpu(smb_rsp->OutputBufferLength);
+ }
cnotify_exit:
if (rqst.rq_iov)
@@ -4090,7 +4121,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (request_type & CHAINED_REQUEST) {
if (!(request_type & END_OF_CHAIN)) {
/* next 8-byte aligned request */
- *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
+ *total_len = ALIGN(*total_len, 8);
shdr->NextCommand = cpu_to_le32(*total_len);
} else /* END_OF_CHAIN */
shdr->NextCommand = 0;
@@ -4429,7 +4460,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->bytes, wdata->result);
if (wdata->result == -ENOSPC)
pr_warn_once("Out of space writing to %s\n",
- tcon->treeName);
+ tcon->tree_name);
} else
trace_smb3_write_done(0 /* no xid */,
wdata->cfile->fid.persistent_fid,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f57881b8464f..1237bb86e93a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -56,6 +56,9 @@ struct smb2_rdma_crypto_transform {
#define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL
+#define SMB2_SYMLINK_STRUCT_SIZE \
+ (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
+
#define SYMLINK_ERROR_TAG 0x4c4d5953
struct smb2_symlink_err_rsp {
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 3f740f24b96a..be21b5d26f67 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -53,16 +53,12 @@ extern bool smb2_is_valid_oplock_break(char *buffer,
struct TCP_Server_Info *srv);
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
-
-extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
- struct smb2_file_all_info *src);
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
-extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path, FILE_ALL_INFO *data,
- bool *adjust_tz, bool *symlink);
+int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
struct cifs_sb_info *cifs_sb, bool set_alloc);
@@ -95,9 +91,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
unsigned int *pbytes_read);
-extern int smb2_open_file(const unsigned int xid,
- struct cifs_open_parms *oparms,
- __u32 *oplock, FILE_ALL_INFO *buf);
+int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path);
+int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
+ void *buf);
extern int smb2_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
@@ -148,7 +144,8 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon,
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, bool watch_tree,
- u32 completion_filter);
+ u32 completion_filter, u32 max_out_data_len,
+ char **out_data, u32 *plen /* returned data len */);
extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
@@ -278,9 +275,9 @@ extern int smb2_query_info_compound(const unsigned int xid,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
/* query path info from the server using SMB311 POSIX extensions*/
-extern int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *sb, const char *path, struct smb311_posix_qinfo *qinf,
- bool *adjust_tx, bool *symlink);
+int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 1a5fc3314dbf..8e3f26e6f6b9 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -32,19 +32,17 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
struct cifs_secmech *p = &server->secmech;
int rc;
- rc = cifs_alloc_hash("hmac(sha256)",
- &p->hmacsha256,
- &p->sdeschmacsha256);
+ rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
if (rc)
goto err;
- rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes);
+ rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
if (rc)
goto err;
return 0;
err:
- cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256);
+ cifs_free_hash(&p->hmacsha256);
return rc;
}
@@ -54,25 +52,23 @@ smb311_crypto_shash_allocate(struct TCP_Server_Info *server)
struct cifs_secmech *p = &server->secmech;
int rc = 0;
- rc = cifs_alloc_hash("hmac(sha256)",
- &p->hmacsha256,
- &p->sdeschmacsha256);
+ rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
if (rc)
return rc;
- rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes);
+ rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
if (rc)
goto err;
- rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512);
+ rc = cifs_alloc_hash("sha512", &p->sha512);
if (rc)
goto err;
return 0;
err:
- cifs_free_hash(&p->cmacaes, &p->sdesccmacaes);
- cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256);
+ cifs_free_hash(&p->aes_cmac);
+ cifs_free_hash(&p->hmacsha256);
return rc;
}
@@ -219,34 +215,30 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
struct kvec *iov = rqst->rq_iov;
struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
- struct shash_desc *shash;
- struct crypto_shash *hash;
- struct sdesc *sdesc = NULL;
+ struct shash_desc *shash = NULL;
struct smb_rqst drqst;
ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
- if (!ses) {
+ if (unlikely(!ses)) {
cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
- return 0;
+ return -ENOENT;
}
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
if (allocate_crypto) {
- rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc);
+ rc = cifs_alloc_hash("hmac(sha256)", &shash);
if (rc) {
cifs_server_dbg(VFS,
"%s: sha256 alloc failed\n", __func__);
goto out;
}
- shash = &sdesc->shash;
} else {
- hash = server->secmech.hmacsha256;
- shash = &server->secmech.sdeschmacsha256->shash;
+ shash = server->secmech.hmacsha256;
}
- rc = crypto_shash_setkey(hash, ses->auth_key.response,
+ rc = crypto_shash_setkey(shash->tfm, ses->auth_key.response,
SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
cifs_server_dbg(VFS,
@@ -288,7 +280,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
out:
if (allocate_crypto)
- cifs_free_hash(&hash, &sdesc);
+ cifs_free_hash(&shash);
if (ses)
cifs_put_smb_ses(ses);
return rc;
@@ -315,42 +307,38 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
goto smb3signkey_ret;
}
- rc = crypto_shash_setkey(server->secmech.hmacsha256,
+ rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm,
ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+ rc = crypto_shash_init(server->secmech.hmacsha256);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- i, 4);
+ rc = crypto_shash_update(server->secmech.hmacsha256, i, 4);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- label.iov_base, label.iov_len);
+ rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- &zero, 1);
+ rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- context.iov_base, context.iov_len);
+ rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__);
goto smb3signkey_ret;
@@ -358,19 +346,16 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- L256, 4);
+ rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4);
} else {
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- L128, 4);
+ rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4);
}
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
- hashptr);
+ rc = crypto_shash_final(server->secmech.hmacsha256, hashptr);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
goto smb3signkey_ret;
@@ -550,38 +535,35 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
- struct shash_desc *shash;
- struct crypto_shash *hash;
- struct sdesc *sdesc = NULL;
+ struct shash_desc *shash = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
- if (rc)
- return 0;
+ if (unlikely(rc)) {
+ cifs_server_dbg(VFS, "%s: Could not get signing key\n", __func__);
+ return rc;
+ }
if (allocate_crypto) {
- rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc);
+ rc = cifs_alloc_hash("cmac(aes)", &shash);
if (rc)
return rc;
-
- shash = &sdesc->shash;
} else {
- hash = server->secmech.cmacaes;
- shash = &server->secmech.sdesccmacaes->shash;
+ shash = server->secmech.aes_cmac;
}
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE);
+ rc = crypto_shash_setkey(shash->tfm, key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
goto out;
}
/*
- * we already allocate sdesccmacaes when we init smb3 signing key,
+ * we already allocate aes_cmac when we init smb3 signing key,
* so unlike smb2 case we do not have to check here if secmech are
* initialized
*/
@@ -617,7 +599,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
out:
if (allocate_crypto)
- cifs_free_hash(&hash, &sdesc);
+ cifs_free_hash(&shash);
return rc;
}
@@ -902,7 +884,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
{
struct crypto_aead *tfm;
- if (!server->secmech.ccmaesencrypt) {
+ if (!server->secmech.enc) {
if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
@@ -913,23 +895,23 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
__func__);
return PTR_ERR(tfm);
}
- server->secmech.ccmaesencrypt = tfm;
+ server->secmech.enc = tfm;
}
- if (!server->secmech.ccmaesdecrypt) {
+ if (!server->secmech.dec) {
if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
else
tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
if (IS_ERR(tfm)) {
- crypto_free_aead(server->secmech.ccmaesencrypt);
- server->secmech.ccmaesencrypt = NULL;
+ crypto_free_aead(server->secmech.enc);
+ server->secmech.enc = NULL;
cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n",
__func__);
return PTR_ERR(tfm);
}
- server->secmech.ccmaesdecrypt = tfm;
+ server->secmech.dec = tfm;
}
return 0;
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5fbbec22bcc8..90789aaa6567 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -90,7 +90,7 @@ int smbd_max_send_size = 1364;
int smbd_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
-int smbd_max_receive_size = 8192;
+int smbd_max_receive_size = 1364;
/* The timeout to initiate send of a keepalive message on idle */
int smbd_keep_alive_interval = 120;
@@ -99,7 +99,7 @@ int smbd_keep_alive_interval = 120;
* User configurable initial values for RDMA transport
* The actual values used may be lower and are limited to hardware capabilities
*/
-/* Default maximum number of SGEs in a RDMA write/read */
+/* Default maximum number of pages in a single RDMA write/read */
int smbd_max_frmr_depth = 2048;
/* If payload is less than this byte, use RDMA send/recv not read/write */
@@ -270,7 +270,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbd_request *request =
container_of(wc->wr_cqe, struct smbd_request, cqe);
- log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
+ log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n",
request, wc->status);
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
@@ -448,7 +448,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbd_connection *info = response->info;
int data_length = 0;
- log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n",
+ log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
response, response->type, wc->status, wc->opcode,
wc->byte_len, wc->pkey_index);
@@ -723,7 +723,7 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
- log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
+ log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
@@ -792,7 +792,7 @@ static int smbd_post_send(struct smbd_connection *info,
for (i = 0; i < request->num_sge; i++) {
log_rdma_send(INFO,
- "rdma_request sge[%d] addr=%llu length=%u\n",
+ "rdma_request sge[%d] addr=0x%llx length=%u\n",
i, request->sge[i].addr, request->sge[i].length);
ib_dma_sync_single_for_device(
info->id->device,
@@ -1017,9 +1017,9 @@ static int smbd_post_send_data(
{
int i;
u32 data_length = 0;
- struct scatterlist sgl[SMBDIRECT_MAX_SGE];
+ struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1];
- if (n_vec > SMBDIRECT_MAX_SGE) {
+ if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) {
cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
return -EINVAL;
}
@@ -1079,7 +1079,7 @@ static int smbd_negotiate(struct smbd_connection *info)
response->type = SMBD_NEGOTIATE_RESP;
rc = smbd_post_recv(info, response);
- log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n",
+ log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
rc, response->sge.addr,
response->sge.length, response->sge.lkey);
if (rc)
@@ -1539,7 +1539,7 @@ static struct smbd_connection *_smbd_get_connection(
if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
- log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
+ log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_send_credit_target,
info->id->device->attrs.max_cqe,
info->id->device->attrs.max_qp_wr);
@@ -1548,7 +1548,7 @@ static struct smbd_connection *_smbd_get_connection(
if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
- log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
+ log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_receive_credit_max,
info->id->device->attrs.max_cqe,
info->id->device->attrs.max_qp_wr);
@@ -1562,17 +1562,15 @@ static struct smbd_connection *_smbd_get_connection(
info->max_receive_size = smbd_max_receive_size;
info->keep_alive_interval = smbd_keep_alive_interval;
- if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) {
+ if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE ||
+ info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) {
log_rdma_event(ERR,
- "warning: device max_send_sge = %d too small\n",
- info->id->device->attrs.max_send_sge);
- log_rdma_event(ERR, "Queue Pair creation may fail\n");
- }
- if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) {
- log_rdma_event(ERR,
- "warning: device max_recv_sge = %d too small\n",
+ "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
+ IB_DEVICE_NAME_MAX,
+ info->id->device->name,
+ info->id->device->attrs.max_send_sge,
info->id->device->attrs.max_recv_sge);
- log_rdma_event(ERR, "Queue Pair creation may fail\n");
+ goto config_failed;
}
info->send_cq = NULL;
@@ -1598,8 +1596,8 @@ static struct smbd_connection *_smbd_get_connection(
qp_attr.qp_context = info;
qp_attr.cap.max_send_wr = info->send_credit_target;
qp_attr.cap.max_recv_wr = info->receive_credit_max;
- qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
- qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
+ qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE;
+ qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE;
qp_attr.cap.max_inline_data = 0;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
@@ -1986,10 +1984,11 @@ int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst_array)
{
struct smbd_connection *info = server->smbd_conn;
- struct kvec vec;
+ struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1];
int nvecs;
int size;
unsigned int buflen, remaining_data_length;
+ unsigned int offset, remaining_vec_data_length;
int start, i, j;
int max_iov_size =
info->max_send_size - sizeof(struct smbd_data_transfer);
@@ -1998,10 +1997,8 @@ int smbd_send(struct TCP_Server_Info *server,
struct smb_rqst *rqst;
int rqst_idx;
- if (info->transport_status != SMBD_CONNECTED) {
- rc = -EAGAIN;
- goto done;
- }
+ if (info->transport_status != SMBD_CONNECTED)
+ return -EAGAIN;
/*
* Add in the page array if there is one. The caller needs to set
@@ -2012,125 +2009,95 @@ int smbd_send(struct TCP_Server_Info *server,
for (i = 0; i < num_rqst; i++)
remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
- if (remaining_data_length > info->max_fragmented_send_size) {
+ if (unlikely(remaining_data_length > info->max_fragmented_send_size)) {
+ /* assertion: payload never exceeds negotiated maximum */
log_write(ERR, "payload size %d > max size %d\n",
remaining_data_length, info->max_fragmented_send_size);
- rc = -EINVAL;
- goto done;
+ return -EINVAL;
}
log_write(INFO, "num_rqst=%d total length=%u\n",
num_rqst, remaining_data_length);
rqst_idx = 0;
-next_rqst:
- rqst = &rqst_array[rqst_idx];
- iov = rqst->rq_iov;
-
- cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
- rqst_idx, smb_rqst_len(server, rqst));
- for (i = 0; i < rqst->rq_nvec; i++)
- dump_smb(iov[i].iov_base, iov[i].iov_len);
-
-
- log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n",
- rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
- rqst->rq_tailsz, smb_rqst_len(server, rqst));
-
- start = i = 0;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen-iov[i].iov_len);
- log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n",
- start, i, i - start,
- remaining_data_length);
- rc = smbd_post_send_data(
- info, &iov[start], i-start,
- remaining_data_length);
- if (rc)
- goto done;
- } else {
- /* iov[start] is too big, break it */
- nvecs = (buflen+max_iov_size-1)/max_iov_size;
- log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n",
- start, iov[start].iov_base,
- buflen, nvecs);
- for (j = 0; j < nvecs; j++) {
- vec.iov_base =
- (char *)iov[start].iov_base +
- j*max_iov_size;
- vec.iov_len = max_iov_size;
- if (j == nvecs-1)
- vec.iov_len =
- buflen -
- max_iov_size*(nvecs-1);
- remaining_data_length -= vec.iov_len;
- log_write(INFO,
- "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n",
- j, vec.iov_base, vec.iov_len,
- remaining_data_length);
- rc = smbd_post_send_data(
- info, &vec, 1,
- remaining_data_length);
- if (rc)
- goto done;
+ do {
+ rqst = &rqst_array[rqst_idx];
+ iov = rqst->rq_iov;
+
+ cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
+ rqst_idx, smb_rqst_len(server, rqst));
+ remaining_vec_data_length = 0;
+ for (i = 0; i < rqst->rq_nvec; i++) {
+ remaining_vec_data_length += iov[i].iov_len;
+ dump_smb(iov[i].iov_base, iov[i].iov_len);
+ }
+
+ log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n",
+ rqst_idx, rqst->rq_nvec,
+ rqst->rq_npages, rqst->rq_pagesz,
+ rqst->rq_tailsz, smb_rqst_len(server, rqst));
+
+ start = 0;
+ offset = 0;
+ do {
+ buflen = 0;
+ i = start;
+ j = 0;
+ while (i < rqst->rq_nvec &&
+ j < SMBDIRECT_MAX_SEND_SGE - 1 &&
+ buflen < max_iov_size) {
+
+ vecs[j].iov_base = iov[i].iov_base + offset;
+ if (buflen + iov[i].iov_len > max_iov_size) {
+ vecs[j].iov_len =
+ max_iov_size - iov[i].iov_len;
+ buflen = max_iov_size;
+ offset = vecs[j].iov_len;
+ } else {
+ vecs[j].iov_len =
+ iov[i].iov_len - offset;
+ buflen += vecs[j].iov_len;
+ offset = 0;
+ ++i;
}
- i++;
- if (i == rqst->rq_nvec)
- break;
+ ++j;
}
+
+ remaining_vec_data_length -= buflen;
+ remaining_data_length -= buflen;
+ log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n",
+ remaining_vec_data_length > 0 ?
+ "partial" : "complete",
+ rqst->rq_nvec, start, j,
+ remaining_data_length);
+
start = i;
- buflen = 0;
- } else {
- i++;
- if (i == rqst->rq_nvec) {
- /* send out all remaining vecs */
- remaining_data_length -= buflen;
- log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n",
- start, i, i - start,
+ rc = smbd_post_send_data(info, vecs, j, remaining_data_length);
+ if (rc)
+ goto done;
+ } while (remaining_vec_data_length > 0);
+
+ /* now sending pages if there are any */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ rqst_page_get_length(rqst, i, &buflen, &offset);
+ nvecs = (buflen + max_iov_size - 1) / max_iov_size;
+ log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
+ buflen, nvecs);
+ for (j = 0; j < nvecs; j++) {
+ size = min_t(unsigned int, max_iov_size, remaining_data_length);
+ remaining_data_length -= size;
+ log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n",
+ i, j * max_iov_size + offset, size,
remaining_data_length);
- rc = smbd_post_send_data(info, &iov[start],
- i-start, remaining_data_length);
+ rc = smbd_post_send_page(
+ info, rqst->rq_pages[i],
+ j*max_iov_size + offset,
+ size, remaining_data_length);
if (rc)
goto done;
- break;
}
}
- log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
- }
-
- /* now sending pages if there are any */
- for (i = 0; i < rqst->rq_npages; i++) {
- unsigned int offset;
-
- rqst_page_get_length(rqst, i, &buflen, &offset);
- nvecs = (buflen + max_iov_size - 1) / max_iov_size;
- log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
- buflen, nvecs);
- for (j = 0; j < nvecs; j++) {
- size = max_iov_size;
- if (j == nvecs-1)
- size = buflen - j*max_iov_size;
- remaining_data_length -= size;
- log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n",
- i, j * max_iov_size + offset, size,
- remaining_data_length);
- rc = smbd_post_send_page(
- info, rqst->rq_pages[i],
- j*max_iov_size + offset,
- size, remaining_data_length);
- if (rc)
- goto done;
- }
- }
-
- rqst_idx++;
- if (rqst_idx < num_rqst)
- goto next_rqst;
+ } while (++rqst_idx < num_rqst);
done:
/*
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index a87fca82a796..207ef979cd51 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -91,7 +91,7 @@ struct smbd_connection {
/* Memory registrations */
/* Maximum number of RDMA read/write outstanding on this connection */
int responder_resources;
- /* Maximum number of SGEs in a RDMA write/read */
+ /* Maximum number of pages in a single RDMA write/read on this connection */
int max_frmr_depth;
/*
* If payload is less than or equal to the threshold,
@@ -225,21 +225,25 @@ struct smbd_buffer_descriptor_v1 {
__le32 length;
} __packed;
-/* Default maximum number of SGEs in a RDMA send/recv */
-#define SMBDIRECT_MAX_SGE 16
+/* Maximum number of SGEs used by smbdirect.c in any send work request */
+#define SMBDIRECT_MAX_SEND_SGE 6
+
/* The context for a SMBD request */
struct smbd_request {
struct smbd_connection *info;
struct ib_cqe cqe;
- /* the SGE entries for this packet */
- struct ib_sge sge[SMBDIRECT_MAX_SGE];
+ /* the SGE entries for this work request */
+ struct ib_sge sge[SMBDIRECT_MAX_SEND_SGE];
int num_sge;
/* SMBD packet header follows this structure */
u8 packet[];
};
+/* Maximum number of SGEs used by smbdirect.c in any receive work request */
+#define SMBDIRECT_MAX_RECV_SGE 1
+
/* The context for a SMBD response */
struct smbd_response {
struct smbd_connection *info;
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 6b88dc2e364f..110070ba8b04 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -372,6 +372,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
@@ -409,6 +410,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
@@ -451,6 +453,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
/*
* For logging SMB3 Status code and Command for responses which return errors
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 9a2753e21170..575fa8f58342 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -753,8 +753,9 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
- error = wait_event_freezekillable_unsafe(server->response_q,
- midQ->mid_state != MID_REQUEST_SUBMITTED);
+ error = wait_event_state(server->response_q,
+ midQ->mid_state != MID_REQUEST_SUBMITTED,
+ (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
if (error < 0)
return -ERESTARTSYS;
diff --git a/fs/coredump.c b/fs/coredump.c
index 3538f3a63965..7bad7785e8e6 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -354,7 +354,7 @@ static int zap_process(struct task_struct *start, int exit_code)
struct task_struct *t;
int nr = 0;
- /* ignore all signals except SIGKILL, see prepare_signal() */
+ /* Allow SIGKILL, see prepare_signal() */
start->signal->flags = SIGNAL_GROUP_EXIT;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
@@ -402,9 +402,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
if (core_waiters > 0) {
struct core_thread *ptr;
- freezer_do_not_count();
- wait_for_completion(&core_state->startup);
- freezer_count();
+ wait_for_completion_state(&core_state->startup,
+ TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
/*
* Wait for all the threads to become inactive, so that
* all the thread context (extended register state, like
@@ -412,7 +411,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
*/
ptr = core_state->dumper.next;
while (ptr != NULL) {
- wait_task_inactive(ptr->task, 0);
+ wait_task_inactive(ptr->task, TASK_ANY);
ptr = ptr->next;
}
}
@@ -1101,30 +1100,20 @@ whole:
return vma->vm_end - vma->vm_start;
}
-static struct vm_area_struct *first_vma(struct task_struct *tsk,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret = tsk->mm->mmap;
-
- if (ret)
- return ret;
- return gate_vma;
-}
-
/*
* Helper function for iterating across a vma list. It ensures that the caller
* will visit `gate_vma' prior to terminating the search.
*/
-static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+static struct vm_area_struct *coredump_next_vma(struct ma_state *mas,
+ struct vm_area_struct *vma,
struct vm_area_struct *gate_vma)
{
- struct vm_area_struct *ret;
-
- ret = this_vma->vm_next;
- if (ret)
- return ret;
- if (this_vma == gate_vma)
+ if (gate_vma && (vma == gate_vma))
return NULL;
+
+ vma = mas_next(mas, ULONG_MAX);
+ if (vma)
+ return vma;
return gate_vma;
}
@@ -1148,9 +1137,10 @@ static void free_vma_snapshot(struct coredump_params *cprm)
*/
static bool dump_vma_snapshot(struct coredump_params *cprm)
{
- struct vm_area_struct *vma, *gate_vma;
+ struct vm_area_struct *gate_vma, *vma = NULL;
struct mm_struct *mm = current->mm;
- int i;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int i = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1170,8 +1160,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
return false;
}
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma), i++) {
+ while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) {
struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
@@ -1179,10 +1168,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
m->pgoff = vma->vm_pgoff;
-
m->file = vma->vm_file;
if (m->file)
get_file(m->file);
+ i++;
}
mmap_write_unlock(mm);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 1cca09aa43f8..2a24b1f0ae68 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -205,14 +205,19 @@ static int allocate_filesystem_keyring(struct super_block *sb)
}
/*
- * This is called at unmount time to release all encryption keys that have been
- * added to the filesystem, along with the keyring that contains them.
+ * Release all encryption keys that have been added to the filesystem, along
+ * with the keyring that contains them.
*
- * Note that besides clearing and freeing memory, this might need to evict keys
- * from the keyslots of an inline crypto engine. Therefore, this must be called
- * while the filesystem's underlying block device(s) are still available.
+ * This is called at unmount time. The filesystem's underlying block device(s)
+ * are still available at this time; this is important because after user file
+ * accesses have been allowed, this function may need to evict keys from the
+ * keyslots of an inline crypto engine, which requires the block device(s).
+ *
+ * This is also called when the super_block is being freed. This is needed to
+ * avoid a memory leak if mounting fails after the "test_dummy_encryption"
+ * option was processed, as in that case the unmount-time call isn't made.
*/
-void fscrypt_sb_delete(struct super_block *sb)
+void fscrypt_destroy_keyring(struct super_block *sb)
{
struct fscrypt_keyring *keyring = sb->s_master_keys;
size_t i;
diff --git a/fs/d_path.c b/fs/d_path.c
index e4e0ebad1f15..56a6ee4c6331 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -34,7 +34,7 @@ static bool prepend_char(struct prepend_buffer *p, unsigned char c)
}
/*
- * The source of the prepend data can be an optimistoc load
+ * The source of the prepend data can be an optimistic load
* of a dentry name and length. And because we don't hold any
* locks, the length and the pointer to the name may not be
* in sync if a concurrent rename happens, and the kernel
@@ -297,8 +297,7 @@ EXPORT_SYMBOL(d_path);
/*
* Helper function for dentry_operations.d_dname() members
*/
-char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
- const char *fmt, ...)
+char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...)
{
va_list args;
char temp[64];
diff --git a/fs/dcache.c b/fs/dcache.c
index bb0c4d0038db..52e6d5fdab6b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
- /*
- * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT
- * kernels spin_lock() implicitly disables preemption, but not on
- * PREEMPT_RT. So for RT it has to be done explicitly to protect
- * the sequence count write side critical section against a reader
- * or another writer preempting, which would result in a live lock.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
for (;;) {
unsigned n = dir->i_dir_seq;
if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
wait_queue_head_t *d_wait)
{
smp_store_release(&dir->i_dir_seq, n + 2);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
wake_up_all(d_wait);
}
@@ -3258,8 +3249,10 @@ void d_genocide(struct dentry *parent)
EXPORT_SYMBOL(d_genocide);
-void d_tmpfile(struct dentry *dentry, struct inode *inode)
+void d_tmpfile(struct file *file, struct inode *inode)
{
+ struct dentry *dentry = file->f_path.dentry;
+
inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 950c63fa4d0b..ddb3fc258df9 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1121,7 +1121,7 @@ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
}
EXPORT_SYMBOL_GPL(debugfs_print_regs32);
-static int debugfs_show_regset32(struct seq_file *s, void *data)
+static int debugfs_regset32_show(struct seq_file *s, void *data)
{
struct debugfs_regset32 *regset = s->private;
@@ -1136,17 +1136,7 @@ static int debugfs_show_regset32(struct seq_file *s, void *data)
return 0;
}
-static int debugfs_open_regset32(struct inode *inode, struct file *file)
-{
- return single_open(file, debugfs_show_regset32, inode->i_private);
-}
-
-static const struct file_operations fops_regset32 = {
- .open = debugfs_open_regset32,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(debugfs_regset32);
/**
* debugfs_create_regset32 - create a debugfs file that returns register values
@@ -1167,7 +1157,7 @@ void debugfs_create_regset32(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_regset32 *regset)
{
- debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+ debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_regset32);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 232cfdf095ae..2e8e112b1993 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -82,6 +82,8 @@ struct debugfs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -111,6 +113,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = DEBUGFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -145,24 +148,44 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
* but traditionally debugfs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int debugfs_apply_options(struct super_block *sb)
+static void _debugfs_apply_options(struct super_block *sb, bool remount)
{
struct debugfs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct debugfs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
- inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- return 0;
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
+
+ if (!remount || opts->opts & BIT(Opt_gid))
+ inode->i_gid = opts->gid;
+}
+
+static void debugfs_apply_options(struct super_block *sb)
+{
+ _debugfs_apply_options(sb, false);
+}
+
+static void debugfs_apply_options_remount(struct super_block *sb)
+{
+ _debugfs_apply_options(sb, true);
}
static int debugfs_remount(struct super_block *sb, int *flags, char *data)
@@ -175,7 +198,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto fail;
- debugfs_apply_options(sb);
+ debugfs_apply_options_remount(sb);
fail:
return err;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f669163d5860..03d381377ae1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -421,8 +421,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
unsigned long flags;
bio->bi_private = dio;
- /* don't account direct I/O as memory stall */
- bio_clear_flag(bio, BIO_WORKINGSET);
spin_lock_irqsave(&dio->bio_lock, flags);
dio->refcount++;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 5f2b49e13731..f2ed0c0266cb 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -506,7 +506,7 @@ ecryptfs_dentry_to_lower(struct dentry *dentry)
return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
}
-static inline struct path *
+static inline const struct path *
ecryptfs_dentry_to_lower_path(struct dentry *dentry)
{
return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 18d5b91cb573..268b74499c28 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -33,7 +33,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct iov_iter *to)
{
ssize_t rc;
- struct path *path;
+ const struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
@@ -53,7 +53,7 @@ struct ecryptfs_getdents_callback {
};
/* Inspired by generic filldir in fs/readdir.c */
-static int
+static bool
ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
int lower_namelen, loff_t offset, u64 ino, unsigned int d_type)
{
@@ -61,18 +61,19 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
container_of(ctx, struct ecryptfs_getdents_callback, ctx);
size_t name_size;
char *name;
- int rc;
+ int err;
+ bool res;
buf->filldir_called++;
- rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
- buf->sb, lower_name,
- lower_namelen);
- if (rc) {
- if (rc != -EINVAL) {
+ err = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
+ buf->sb, lower_name,
+ lower_namelen);
+ if (err) {
+ if (err != -EINVAL) {
ecryptfs_printk(KERN_DEBUG,
"%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n",
- __func__, lower_name, rc);
- return rc;
+ __func__, lower_name, err);
+ return false;
}
/* Mask -EINVAL errors as these are most likely due a plaintext
@@ -81,16 +82,15 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
* the "lost+found" dentry in the root directory of an Ext4
* filesystem.
*/
- return 0;
+ return true;
}
buf->caller->pos = buf->ctx.pos;
- rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
+ res = dir_emit(buf->caller, name, name_size, ino, d_type);
kfree(name);
- if (!rc)
+ if (res)
buf->entries_written++;
-
- return rc;
+ return res;
}
/**
@@ -111,14 +111,8 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
lower_file = ecryptfs_file_to_lower(file);
rc = iterate_dir(lower_file, &buf.ctx);
ctx->pos = buf.ctx.pos;
- if (rc < 0)
- goto out;
- if (buf.filldir_called && !buf.entries_written)
- goto out;
- if (rc >= 0)
- fsstack_copy_attr_atime(inode,
- file_inode(lower_file));
-out:
+ if (rc >= 0 && (buf.entries_written || !buf.filldir_called))
+ fsstack_copy_attr_atime(inode, file_inode(lower_file));
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 16d50dface59..c214fe0981bd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -317,7 +317,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
struct inode *inode, *lower_inode;
struct ecryptfs_dentry_info *dentry_info;
int rc = 0;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 2dd23a82e0de..2dc927ba067f 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -105,7 +105,7 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+ const struct path *path = ecryptfs_dentry_to_lower_path(dentry);
int rc;
rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
index a0ef63cfcecb..9e4f47808bd5 100644
--- a/fs/efivarfs/vars.c
+++ b/fs/efivarfs/vars.c
@@ -651,22 +651,6 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
if (err)
return err;
- /*
- * Ensure that the available space hasn't shrunk below the safe level
- */
- status = check_var_size(attributes, *size + ucs2_strsize(name, 1024));
- if (status != EFI_SUCCESS) {
- if (status != EFI_UNSUPPORTED) {
- err = efi_status_to_err(status);
- goto out;
- }
-
- if (*size > 65536) {
- err = -ENOSPC;
- goto out;
- }
- }
-
status = efivar_set_variable_locked(name, vendor, attributes, *size,
data, false);
if (status != EFI_SUCCESS) {
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 998cd26a1b3b..fe05bc51f9f2 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -590,14 +590,17 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
struct super_block *psb = erofs_pseudo_mnt->mnt_sb;
mutex_lock(&erofs_domain_cookies_lock);
+ spin_lock(&psb->s_inode_list_lock);
list_for_each_entry(inode, &psb->s_inodes, i_sb_list) {
ctx = inode->i_private;
if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
continue;
igrab(inode);
+ spin_unlock(&psb->s_inode_list_lock);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
+ spin_unlock(&psb->s_inode_list_lock);
ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index cce56dde135c..c7f24fc7efd5 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -7,6 +7,7 @@
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>
+#include <linux/psi.h>
#include <trace/events/erofs.h>
@@ -812,15 +813,14 @@ retry:
++spiltted;
if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
fe->pcl->multibases = true;
-
- if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
- !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
- fe->pcl->length == map->m_llen)
- fe->pcl->partial = false;
if (fe->pcl->length < offset + end - map->m_la) {
fe->pcl->length = offset + end - map->m_la;
fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
}
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ fe->pcl->length == map->m_llen)
+ fe->pcl->partial = false;
next_part:
/* shorten the remaining extent to update progress */
map->m_llen = offset + cur - map->m_la;
@@ -887,15 +887,13 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
unsigned int pgnr;
- struct page *oldpage;
pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
DBG_BUGON(pgnr >= be->nr_pages);
- oldpage = be->decompressed_pages[pgnr];
- be->decompressed_pages[pgnr] = bvec->page;
-
- if (!oldpage)
+ if (!be->decompressed_pages[pgnr]) {
+ be->decompressed_pages[pgnr] = bvec->page;
return;
+ }
}
/* (cold path) one pcluster is requested multiple times */
@@ -1414,6 +1412,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
+ /* initialize to 1 to make skip psi_memstall_leave unless needed */
+ unsigned long pflags = 1;
bi_private = jobqueueset_init(sb, q, fgq, force_fg);
qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
@@ -1463,10 +1463,15 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
if (bio && (cur != last_index + 1 ||
last_bdev != mdev.m_bdev)) {
submit_bio_retry:
+ if (!pflags)
+ psi_memstall_leave(&pflags);
submit_bio(bio);
bio = NULL;
}
+ if (unlikely(PageWorkingset(page)))
+ psi_memstall_enter(&pflags);
+
if (!bio) {
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
@@ -1494,8 +1499,11 @@ submit_bio_retry:
move_to_bypass_jobqueue(pcl, qtail, owned_head);
} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
- if (bio)
+ if (bio) {
+ if (!pflags)
+ psi_memstall_leave(&pflags);
submit_bio(bio);
+ }
/*
* although background is preferred, no one is pending for submission.
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index e7f04c4fbb81..d98c95212985 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -126,10 +126,10 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
}
/*
- * bit 31: I/O error occurred on this page
- * bit 0 - 30: remaining parts to complete this page
+ * bit 30: I/O error occurred on this page
+ * bit 0 - 29: remaining parts to complete this page
*/
-#define Z_EROFS_PAGE_EIO (1 << 31)
+#define Z_EROFS_PAGE_EIO (1 << 30)
static inline void z_erofs_onlinepage_init(struct page *page)
{
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 44c27ef39c43..0bb66927e3d0 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -57,8 +57,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, 8);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
- EROFS_KMAP_ATOMIC);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(kaddr)) {
err = PTR_ERR(kaddr);
goto out_unlock;
@@ -73,7 +72,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
vi->z_tailextent_headlcn = 0;
- goto unmap_done;
+ goto done;
}
vi->z_advise = le16_to_cpu(h->h_advise);
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
@@ -85,7 +84,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
- goto unmap_done;
+ goto out_put_metabuf;
}
vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
@@ -95,7 +94,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto unmap_done;
+ goto out_put_metabuf;
}
if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
@@ -103,12 +102,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto unmap_done;
+ goto out_put_metabuf;
}
-unmap_done:
- erofs_put_metabuf(&buf);
- if (err)
- goto out_unlock;
if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
struct erofs_map_blocks map = {
@@ -127,7 +122,7 @@ unmap_done:
err = -EFSCORRUPTED;
}
if (err < 0)
- goto out_unlock;
+ goto out_put_metabuf;
}
if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
@@ -141,11 +136,14 @@ unmap_done:
EROFS_GET_BLOCKS_FINDTAIL);
erofs_put_metabuf(&map.buf);
if (err < 0)
- goto out_unlock;
+ goto out_put_metabuf;
}
+done:
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+out_put_metabuf:
+ erofs_put_metabuf(&buf);
out_unlock:
clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
return err;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3627dd7d25db..c0ffee99ad23 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -69,17 +69,17 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(current->in_eventfd_signal))
+ if (WARN_ON_ONCE(current->in_eventfd))
return 0;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- current->in_eventfd_signal = 1;
+ current->in_eventfd = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- current->in_eventfd_signal = 0;
+ current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
@@ -253,8 +253,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
__set_current_state(TASK_RUNNING);
}
eventfd_ctx_do_read(ctx, &ucnt);
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+ current->in_eventfd = 0;
spin_unlock_irq(&ctx->wqh.lock);
if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
return -EFAULT;
@@ -301,8 +303,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
}
if (likely(res > 0)) {
ctx->count += ucnt;
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8b56b94e2f56..52954d4637b5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1065,7 +1065,7 @@ static inline bool list_add_tail_lockless(struct list_head *new,
* added to the list from another CPU: the winner observes
* new->next == new.
*/
- if (cmpxchg(&new->next, new, head) != new)
+ if (!try_cmpxchg(&new->next, &new, head))
return false;
/*
diff --git a/fs/exec.c b/fs/exec.c
index 69a572fc57db..32dc8cf5fceb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@@ -683,6 +682,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ struct vm_area_struct *next;
struct mmu_gather tlb;
BUG_ON(new_start > new_end);
@@ -691,7 +692,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* ensure there are no vmas between where we want to go
* and where we are
*/
- if (vma != find_vma(mm, new_start))
+ if (vma != vma_next(&vmi))
return -EFAULT;
/*
@@ -710,12 +711,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -724,7 +726,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb);
@@ -1010,6 +1012,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
+ lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -1022,9 +1025,8 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
- tsk->mm->vmacache_seqnum = 0;
- vmacache_flush(tsk);
task_unlock(tsk);
+ lru_gen_use_mm(mm);
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
@@ -1195,11 +1197,11 @@ static int unshare_sighand(struct task_struct *me)
return -ENOMEM;
refcount_set(&newsighand->count, 1);
- memcpy(newsighand->action, oldsighand->action,
- sizeof(newsighand->action));
write_lock_irq(&tasklist_lock);
spin_lock(&oldsighand->siglock);
+ memcpy(newsighand->action, oldsighand->action,
+ sizeof(newsighand->action));
rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);
@@ -1587,7 +1589,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
/* Handle suid and sgid on files */
struct user_namespace *mnt_userns;
- struct inode *inode;
+ struct inode *inode = file_inode(file);
unsigned int mode;
kuid_t uid;
kgid_t gid;
@@ -1598,7 +1600,6 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
if (task_no_new_privs(current))
return;
- inode = file->f_path.dentry->d_inode;
mode = READ_ONCE(inode->i_mode);
if (!(mode & (S_ISUID|S_ISGID)))
return;
@@ -1880,7 +1881,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index a27b55ec060a..0fc08fdcba73 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -212,9 +212,9 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb)
/* skip iterating emit_dots when dir is empty */
#define ITER_POS_FILLED_DOTS (2)
-static int exfat_iterate(struct file *filp, struct dir_context *ctx)
+static int exfat_iterate(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct inode *tmp;
struct exfat_dir_entry de;
@@ -228,7 +228,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx)
mutex_lock(&EXFAT_SB(sb)->s_lock);
cpos = ctx->pos;
- if (!dir_emit_dots(filp, ctx))
+ if (!dir_emit_dots(file, ctx))
goto unlock;
if (ctx->pos == ITER_POS_FILLED_DOTS) {
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index a795437b86d0..5590a1e83126 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -552,7 +552,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
if (info->attr & ATTR_SUBDIR) { /* directory */
inode->i_generation &= ~1;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 3ef80d000e13..c648a493faf2 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -248,21 +248,20 @@ struct getdents_callback {
* A rather strange filldir function to capture
* the name matching the specified inode number.
*/
-static int filldir_one(struct dir_context *ctx, const char *name, int len,
+static bool filldir_one(struct dir_context *ctx, const char *name, int len,
loff_t pos, u64 ino, unsigned int d_type)
{
struct getdents_callback *buf =
container_of(ctx, struct getdents_callback, ctx);
- int result = 0;
buf->sequence++;
if (buf->ino == ino && len <= NAME_MAX) {
memcpy(buf->name, name, len);
buf->name[len] = '\0';
buf->found = 1;
- result = -1;
+ return false; // no more
}
- return result;
+ return true;
}
/**
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c17ccc19b938..5dc0a31f4a08 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -126,6 +126,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
struct ext2_group_desc * desc;
struct buffer_head * bh = NULL;
ext2_fsblk_t bitmap_blk;
+ int ret;
desc = ext2_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -139,10 +140,10 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
block_group, le32_to_cpu(desc->bg_block_bitmap));
return NULL;
}
- if (likely(bh_uptodate_or_lock(bh)))
+ ret = bh_read(bh, 0);
+ if (ret > 0)
return bh;
-
- if (bh_submit_read(bh) < 0) {
+ if (ret < 0) {
brelse(bh);
ext2_error(sb, __func__,
"Cannot read block bitmap - "
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 998dd2ac8008..f4944c4dee60 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -277,8 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- group = prandom_u32();
- parent_group = (unsigned)group % ngroups;
+ parent_group = prandom_u32_max(ngroups);
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
desc = ext2_get_group_desc (sb, group, NULL);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 5fd9a22d2b70..9125eab85146 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -120,7 +120,7 @@ static int ext2_create (struct user_namespace * mnt_userns,
}
static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct inode *inode = ext2_new_inode(dir, mode, NULL);
if (IS_ERR(inode))
@@ -128,9 +128,9 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
ext2_set_file_ops(inode);
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
unlock_new_inode(inode);
- return 0;
+ return finish_open_simple(file, 0);
}
static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 252c742379cf..03f2af98b1b4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -163,7 +163,7 @@ static void ext2_put_super (struct super_block * sb)
db_count = sbi->s_gdb_count;
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
+ kvfree(sbi->s_group_desc);
kfree(sbi->s_debts);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -1052,6 +1052,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group);
goto failed_mount;
}
+ /* At least inode table, bitmaps, and sb have to fit in one group */
+ if (sbi->s_blocks_per_group <= sbi->s_itb_per_group + 3) {
+ ext2_msg(sb, KERN_ERR,
+ "error: #blocks per group smaller than metadata size: %lu <= %lu",
+ sbi->s_blocks_per_group, sbi->s_inodes_per_group + 3);
+ goto failed_mount;
+ }
if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
ext2_msg(sb, KERN_ERR,
"error: #fragments per group too big: %lu",
@@ -1065,9 +1072,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_inodes_per_group);
goto failed_mount;
}
+ if (sb_bdev_nr_blocks(sb) < le32_to_cpu(es->s_blocks_count)) {
+ ext2_msg(sb, KERN_ERR,
+ "bad geometry: block count %u exceeds size of device (%u blocks)",
+ le32_to_cpu(es->s_blocks_count),
+ (unsigned)sb_bdev_nr_blocks(sb));
+ goto failed_mount;
+ }
- if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext2;
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
@@ -1080,7 +1092,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc_array(db_count,
+ sbi->s_group_desc = kvmalloc_array(db_count,
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
@@ -1206,7 +1218,7 @@ failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
failed_mount_group_desc:
- kfree(sbi->s_group_desc);
+ kvfree(sbi->s_group_desc);
kfree(sbi->s_debts);
failed_mount:
brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e5f2f5ca5120..8d5453852f98 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3592,9 +3592,6 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len);
extern void *ext4_read_inline_link(struct inode *inode);
struct iomap;
@@ -3713,7 +3710,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path **,
int flags);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5235974126bd..f1956288307f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -106,6 +106,25 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
return 0;
}
+static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+ int depth, i;
+
+ if (!path)
+ return;
+ depth = path->p_depth;
+ for (i = 0; i <= depth; i++, path++) {
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+ }
+}
+
+void ext4_free_ext_path(struct ext4_ext_path *path)
+{
+ ext4_ext_drop_refs(path);
+ kfree(path);
+}
+
/*
* Make sure 'handle' has at least 'check_cred' credits. If not, restart
* transaction with 'restart_cred' credits. The function drops i_data_sem
@@ -636,8 +655,7 @@ int ext4_ext_precache(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
up_read(&ei->i_data_sem);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -724,19 +742,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
#define ext4_ext_show_move(inode, path, newblock, level)
#endif
-void ext4_ext_drop_refs(struct ext4_ext_path *path)
-{
- int depth, i;
-
- if (!path)
- return;
- depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
-}
-
/*
* ext4_ext_binsearch_idx:
* binary search for the closest index of the given block
@@ -955,8 +960,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
return path;
err:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (orig_path)
*orig_path = NULL;
return ERR_PTR(ret);
@@ -2174,8 +2178,7 @@ merge:
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
- ext4_ext_drop_refs(npath);
- kfree(npath);
+ ext4_free_ext_path(npath);
return err;
}
@@ -3061,8 +3064,7 @@ again:
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
path = NULL;
if (err == -EAGAIN)
goto again;
@@ -4375,8 +4377,7 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
@@ -5245,8 +5246,7 @@ again:
break;
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -5538,15 +5538,13 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (ret < 0) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
} else {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
ret = ext4_es_remove_extent(inode, offset_lblk,
@@ -5766,10 +5764,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
count -= len;
repeat:
- ext4_ext_drop_refs(path1);
- kfree(path1);
- ext4_ext_drop_refs(path2);
- kfree(path2);
+ ext4_free_ext_path(path1);
+ ext4_free_ext_path(path2);
path1 = path2 = NULL;
}
return replaced_count;
@@ -5848,8 +5844,7 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return err ? err : mapped;
}
@@ -5916,8 +5911,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_mark_inode_dirty(NULL, inode);
return ret;
}
@@ -5935,8 +5929,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
return;
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_mark_inode_dirty(NULL, inode);
return;
}
@@ -5949,8 +5942,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
ext4_mark_inode_dirty(NULL, inode);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
}
@@ -5989,13 +5981,11 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
goto out;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
/* Count the number of data blocks */
cur = 0;
@@ -6025,30 +6015,26 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (IS_ERR(path))
goto out;
numblks += path->p_depth;
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
while (cur < end) {
path = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path))
break;
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return 0;
}
cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex));
ret = skip_hole(inode, &cur);
if (ret < 0) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
break;
}
path2 = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path2)) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
break;
}
for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
@@ -6062,10 +6048,8 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (cmp1 != cmp2 && cmp2 != 0)
numblks++;
}
- ext4_ext_drop_refs(path);
- ext4_ext_drop_refs(path2);
- kfree(path);
- kfree(path2);
+ ext4_free_ext_path(path);
+ ext4_free_ext_path(path2);
}
out:
@@ -6092,13 +6076,11 @@ int ext4_ext_clear_bb(struct inode *inode)
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return 0;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
cur = 0;
while (cur < end) {
@@ -6117,8 +6099,7 @@ int ext4_ext_clear_bb(struct inode *inode)
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
0, path[j].p_block, 1, 1);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 23167efda95e..cd0a861853e3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -667,8 +667,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
static void ext4_es_insert_extent_ind_check(struct inode *inode,
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 2af962cbb835..ef05bfa87798 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -229,6 +229,12 @@ __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
finish_wait(wq, &wait.wq_entry);
}
+static bool ext4_fc_disabled(struct super_block *sb)
+{
+ return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
+}
+
/*
* Inform Ext4's fast about start of an inode update
*
@@ -240,8 +246,7 @@ void ext4_fc_start_update(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
restart:
@@ -265,8 +270,7 @@ void ext4_fc_stop_update(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (atomic_dec_and_test(&ei->i_fc_updates))
@@ -283,8 +287,7 @@ void ext4_fc_del(struct inode *inode)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
restart:
@@ -337,8 +340,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
struct ext4_sb_info *sbi = EXT4_SB(sb);
tid_t tid;
- if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(sb))
return;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
@@ -493,10 +495,8 @@ void __ext4_fc_track_unlink(handle_t *handle,
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -522,10 +522,8 @@ void __ext4_fc_track_link(handle_t *handle,
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -551,10 +549,8 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -576,22 +572,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
if (S_ISDIR(inode->i_mode))
return;
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
if (ext4_should_journal_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb,
EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
return;
}
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
- return;
-
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
@@ -634,15 +628,13 @@ static int __track_range(struct inode *inode, void *arg, bool update)
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct __track_range_args args;
int ret;
if (S_ISDIR(inode->i_mode))
return;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -710,10 +702,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
* After allocating len, we should have space at least for a 0 byte
* padding.
*/
- if (len + sizeof(struct ext4_fc_tl) > bsize)
+ if (len + EXT4_FC_TAG_BASE_LEN > bsize)
return NULL;
- if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
+ if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
/*
* Only allocate from current buffer if we have enough space for
* this request AND we have space to add a zero byte padding.
@@ -730,10 +722,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
/* Need to add PAD tag */
tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
- pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
+ pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
tl->fc_len = cpu_to_le16(pad_len);
if (crc)
- *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
+ *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
if (pad_len > 0)
ext4_fc_memzero(sb, tl + 1, pad_len, crc);
ext4_fc_submit_bh(sb, false);
@@ -775,7 +767,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
* ext4_fc_reserve_space takes care of allocating an extra block if
* there's no enough space on this block for accommodating this tail.
*/
- dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
if (!dst)
return -ENOSPC;
@@ -785,8 +777,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
- dst += sizeof(tl);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
+ dst += EXT4_FC_TAG_BASE_LEN;
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
dst += sizeof(tail.fc_tid);
@@ -808,15 +800,15 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
struct ext4_fc_tl tl;
u8 *dst;
- dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
if (!dst)
return false;
tl.fc_tag = cpu_to_le16(tag);
tl.fc_len = cpu_to_le16(len);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
- ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);
return true;
}
@@ -828,8 +820,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
struct ext4_fc_dentry_info fcd;
struct ext4_fc_tl tl;
int dlen = fc_dentry->fcd_name.len;
- u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
- crc);
+ u8 *dst = ext4_fc_reserve_space(sb,
+ EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
if (!dst)
return false;
@@ -838,8 +830,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
- dst += sizeof(tl);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ dst += EXT4_FC_TAG_BASE_LEN;
ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
dst += sizeof(fcd);
ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
@@ -874,22 +866,25 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+ ret = -ECANCELED;
dst = ext4_fc_reserve_space(inode->i_sb,
- sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
+ EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
if (!dst)
- return -ECANCELED;
+ goto err;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
- return -ECANCELED;
- dst += sizeof(tl);
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
+ goto err;
+ dst += EXT4_FC_TAG_BASE_LEN;
if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
- return -ECANCELED;
+ goto err;
dst += sizeof(fc_inode);
if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
inode_len, crc))
- return -ECANCELED;
-
- return 0;
+ goto err;
+ ret = 0;
+err:
+ brelse(iloc.bh);
+ return ret;
}
/*
@@ -1343,7 +1338,7 @@ struct dentry_info_args {
};
static inline void tl_to_darg(struct dentry_info_args *darg,
- struct ext4_fc_tl *tl, u8 *val)
+ struct ext4_fc_tl *tl, u8 *val)
{
struct ext4_fc_dentry_info fcd;
@@ -1352,8 +1347,14 @@ static inline void tl_to_darg(struct dentry_info_args *darg,
darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
darg->ino = le32_to_cpu(fcd.fc_ino);
darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
- darg->dname_len = le16_to_cpu(tl->fc_len) -
- sizeof(struct ext4_fc_dentry_info);
+ darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
+}
+
+static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
+{
+ memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
+ tl->fc_len = le16_to_cpu(tl->fc_len);
+ tl->fc_tag = le16_to_cpu(tl->fc_tag);
}
/* Unlink replay function */
@@ -1491,13 +1492,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
if (state->fc_modified_inodes[i] == ino)
return 0;
if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
- state->fc_modified_inodes = krealloc(
- state->fc_modified_inodes,
+ int *fc_modified_inodes;
+
+ fc_modified_inodes = krealloc(state->fc_modified_inodes,
sizeof(int) * (state->fc_modified_inodes_size +
EXT4_FC_REPLAY_REALLOC_INCREMENT),
GFP_KERNEL);
- if (!state->fc_modified_inodes)
+ if (!fc_modified_inodes)
return -ENOMEM;
+ state->fc_modified_inodes = fc_modified_inodes;
state->fc_modified_inodes_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
}
@@ -1516,7 +1519,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
struct ext4_inode *raw_fc_inode;
struct inode *inode = NULL;
struct ext4_iloc iloc;
- int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
+ int inode_len, ino, ret, tag = tl->fc_tag;
struct ext4_extent_header *eh;
memcpy(&fc_inode, val, sizeof(fc_inode));
@@ -1541,7 +1544,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
if (ret)
goto out;
- inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
+ inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
raw_inode = ext4_raw_inode(&iloc);
memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
@@ -1682,15 +1685,18 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
if (replay && state->fc_regions_used != state->fc_regions_valid)
state->fc_regions_used = state->fc_regions_valid;
if (state->fc_regions_used == state->fc_regions_size) {
+ struct ext4_fc_alloc_region *fc_regions;
+
+ fc_regions = krealloc(state->fc_regions,
+ sizeof(struct ext4_fc_alloc_region) *
+ (state->fc_regions_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_regions)
+ return -ENOMEM;
state->fc_regions_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
- state->fc_regions = krealloc(
- state->fc_regions,
- state->fc_regions_size *
- sizeof(struct ext4_fc_alloc_region),
- GFP_KERNEL);
- if (!state->fc_regions)
- return -ENOMEM;
+ state->fc_regions = fc_regions;
}
region = &state->fc_regions[state->fc_regions_used++];
region->ino = ino;
@@ -1770,8 +1776,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ret = ext4_ext_insert_extent(
NULL, inode, &path, &newex, 0);
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (ret)
goto out;
goto next;
@@ -1926,8 +1931,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, 1);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
@@ -1972,6 +1976,34 @@ void ext4_fc_replay_cleanup(struct super_block *sb)
kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
+static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
+ u8 *val, u8 *end)
+{
+ if (val + tl->fc_len > end)
+ return false;
+
+ /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do
+ * journal rescan before do CRC check. Other tags length check will
+ * rely on CRC check.
+ */
+ switch (tl->fc_tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
+ case EXT4_FC_TAG_TAIL:
+ return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
+ case EXT4_FC_TAG_HEAD:
+ return (sizeof(struct ext4_fc_head) == tl->fc_len);
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ case EXT4_FC_TAG_PAD:
+ default:
+ return true;
+ }
+}
+
/*
* Recovery Scan phase handler
*
@@ -2028,12 +2060,18 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_replay_expected_off++;
- for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
- memcpy(&tl, cur, sizeof(tl));
- val = cur + sizeof(tl);
+ for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
+ if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ goto out_err;
+ }
ext4_debug("Scan phase, tag:%s, blk %lld\n",
- tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
- switch (le16_to_cpu(tl.fc_tag)) {
+ tag2str(tl.fc_tag), bh->b_blocknr);
+ switch (tl.fc_tag) {
case EXT4_FC_TAG_ADD_RANGE:
memcpy(&ext, val, sizeof(ext));
ex = (struct ext4_extent *)&ext.fc_ex;
@@ -2053,13 +2091,13 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) + le16_to_cpu(tl.fc_len));
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) +
+ EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc));
if (le32_to_cpu(tail.fc_tid) == expected_tid &&
@@ -2086,7 +2124,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) + le16_to_cpu(tl.fc_len));
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
default:
ret = state->fc_replay_num_tags ?
@@ -2141,19 +2179,20 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
start = (u8 *)bh->b_data;
end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
- for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
- memcpy(&tl, cur, sizeof(tl));
- val = cur + sizeof(tl);
+ for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
if (state->fc_replay_num_tags == 0) {
ret = JBD2_FC_REPLAY_STOP;
ext4_fc_set_bitmaps_and_counters(sb);
break;
}
- ext4_debug("Replay phase, tag:%s\n",
- tag2str(le16_to_cpu(tl.fc_tag)));
+
+ ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
state->fc_replay_num_tags--;
- switch (le16_to_cpu(tl.fc_tag)) {
+ switch (tl.fc_tag) {
case EXT4_FC_TAG_LINK:
ret = ext4_fc_replay_link(sb, &tl, val);
break;
@@ -2174,19 +2213,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
break;
case EXT4_FC_TAG_PAD:
trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
- le16_to_cpu(tl.fc_len), 0);
+ tl.fc_len, 0);
break;
case EXT4_FC_TAG_TAIL:
- trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
- le16_to_cpu(tl.fc_len), 0);
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
+ 0, tl.fc_len, 0);
memcpy(&tail, val, sizeof(tail));
WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
break;
case EXT4_FC_TAG_HEAD:
break;
default:
- trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
- le16_to_cpu(tl.fc_len), 0);
+ trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
ret = -ECANCELED;
break;
}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 1db12847a83b..a6154c3ed135 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -70,6 +70,9 @@ struct ext4_fc_tail {
__le32 fc_crc;
};
+/* Tag base length */
+#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl))
+
/*
* Fast commit status codes
*/
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8bb1c35fd6dd..a7a597c727e6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -543,6 +543,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = -EAGAIN;
goto out;
}
+ /*
+ * Make sure inline data cannot be created anymore since we are going
+ * to allocate blocks for DIO. We know the inode does not have any
+ * inline data now because ext4_dio_supported() checked for that.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
offset = iocb->ki_pos;
count = ret;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 208b87ce8858..e9bc46684106 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -463,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
hinfo.hash_version = DX_HASH_HALF_MD4;
hinfo.seed = sbi->s_hash_seed;
ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
- grp = hinfo.hash;
+ parent_group = hinfo.hash % ngroups;
} else
- grp = prandom_u32();
- parent_group = (unsigned)grp % ngroups;
+ parent_group = prandom_u32_max(ngroups);
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
get_orlov_stats(sb, g, flex_size, &stats);
@@ -1280,7 +1279,7 @@ got:
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto out;
}
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
if (ext4_has_metadata_csum(sb)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 364774230d87..2b5ef1b64249 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1188,6 +1188,13 @@ retry_grab:
page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
+ /*
+ * The same as page allocation, we prealloc buffer heads before
+ * starting the handle.
+ */
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
unlock_page(page);
retry_journal:
@@ -5342,6 +5349,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
int error, rc = 0;
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
+ bool inc_ivers = true;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
@@ -5425,8 +5433,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;
}
- if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
- inode_inc_iversion(inode);
+ if (attr->ia_size == inode->i_size)
+ inc_ivers = false;
if (shrink) {
if (ext4_should_order_data(inode)) {
@@ -5528,6 +5536,8 @@ out_mmap_sem:
}
if (!error) {
+ if (inc_ivers)
+ inode_inc_iversion(inode);
setattr_copy(mnt_userns, inode, attr);
mark_inode_dirty(inode);
}
@@ -5768,9 +5778,6 @@ int ext4_mark_iloc_dirty(handle_t *handle,
}
ext4_fc_track_inode(handle, inode);
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-
/* the do_update_inode consumes one bh->b_count */
get_bh(iloc->bh);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 3cf3ec4b1c21..ded535535b27 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -452,9 +452,10 @@ static long swap_inode_boot_loader(struct super_block *sb,
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
- inode_bl->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
+ inode_bl->i_generation = get_random_u32();
ext4_reset_inode_seed(inode);
ext4_reset_inode_seed(inode_bl);
@@ -665,6 +666,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_set_inode_flags(inode, false);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -775,6 +777,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
EXT4_I(inode)->i_projid = kprojid;
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -1060,9 +1063,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
if (!EXT4_SB(sb)->s_journal)
return -ENODEV;
- if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID)
- return -EINVAL;
-
if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
!bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
return -EOPNOTSUPP;
@@ -1257,6 +1257,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 54e7d3c95fd7..0a220ec9862d 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -56,8 +56,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
lb->first_pblock = 0;
return retval;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9af68a7ecdcf..588cb09c5291 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -265,7 +265,7 @@ static unsigned int mmp_new_seq(void)
u32 new_seq;
do {
- new_seq = prandom_u32();
+ new_seq = get_random_u32();
} while (new_seq > EXT4_MMP_SEQ_MAX);
return new_seq;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 701f1d6a217f..044e34cd835c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -32,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
if (IS_ERR(path))
return PTR_ERR(path);
if (path[ext_depth(inode)].p_ext == NULL) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
*ppath = NULL;
return -ENODATA;
}
@@ -103,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
from += ext4_ext_get_actual_len(ext);
- ext4_ext_drop_refs(path);
}
ret = 1;
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -472,19 +469,17 @@ mext_check_arguments(struct inode *orig_inode,
if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
return -EPERM;
- /* Ext4 move extent does not support swapfile */
+ /* Ext4 move extent does not support swap files */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be swapfile [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -ETXTBSY;
}
if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be quota files [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -EOPNOTSUPP;
}
/* Ext4 move extent supports only extent based file */
@@ -631,11 +626,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
if (ret)
goto out;
ex = path[path->p_depth].p_ext;
- next_blk = ext4_ext_next_allocated_block(path);
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
/* Check hole before the start pos */
if (cur_blk + cur_len - 1 < o_start) {
+ next_blk = ext4_ext_next_allocated_block(path);
if (next_blk == EXT_MAX_BLOCKS) {
ret = -ENODATA;
goto out;
@@ -663,7 +658,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
donor_page_index = d_start >> (PAGE_SHIFT -
donor_inode->i_blkbits);
offset_in_page = o_start % blocks_per_page;
- if (cur_len > blocks_per_page- offset_in_page)
+ if (cur_len > blocks_per_page - offset_in_page)
cur_len = blocks_per_page - offset_in_page;
/*
* Up semaphore to avoid following problems:
@@ -694,8 +689,7 @@ out:
ext4_discard_preallocations(donor_inode, 0);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3a31b662f661..d5daaf41e1fc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -85,15 +85,20 @@ static struct buffer_head *ext4_append(handle_t *handle,
return bh;
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ goto out;
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
EXT4_JTR_NONE);
- if (err) {
- brelse(bh);
- ext4_std_error(inode->i_sb, err);
- return ERR_PTR(err);
- }
+ if (err)
+ goto out;
return bh;
+
+out:
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
}
static int ext4_dx_csum_verify(struct inode *inode,
@@ -126,7 +131,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
- if (block >= inode->i_size) {
+ if (block >= inode->i_size >> inode->i_blkbits) {
ext4_error_inode(inode, func, line, block,
"Attempting to read directory block (%u) that is past i_size (%llu)",
block, inode->i_size);
@@ -2849,7 +2854,7 @@ retry:
}
static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
handle_t *handle;
struct inode *inode;
@@ -2871,7 +2876,7 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
err = ext4_orphan_add(handle, inode);
if (err)
goto err_unlock_inode;
@@ -2882,7 +2887,7 @@ retry:
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return finish_open_simple(file, err);
err_unlock_inode:
ext4_journal_stop(handle);
unlock_new_inode(inode);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index fea2a68d067b..6dfe9ccae0c5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -2122,7 +2122,7 @@ retry:
goto out;
}
- if (ext4_blocks_count(es) == n_blocks_count)
+ if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
goto out;
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9a66abcca1a8..989365b878a6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -205,19 +205,12 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io
int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
- if (trylock_buffer(bh)) {
- if (wait)
- return ext4_read_bh(bh, op_flags, NULL);
+ lock_buffer(bh);
+ if (!wait) {
ext4_read_bh_nowait(bh, op_flags, NULL);
return 0;
}
- if (wait) {
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
- }
- return 0;
+ return ext4_read_bh(bh, op_flags, NULL);
}
/*
@@ -264,7 +257,8 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
if (likely(bh)) {
- ext4_read_bh_lock(bh, REQ_RAHEAD, false);
+ if (trylock_buffer(bh))
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
brelse(bh);
}
}
@@ -1576,7 +1570,7 @@ enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb,
Opt_nouid32, Opt_debug, Opt_removed,
- Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_user_xattr, Opt_acl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
@@ -1585,7 +1579,7 @@ enum {
Opt_inlinecrypt,
Opt_usrjquota, Opt_grpjquota, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota,
Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
@@ -1662,9 +1656,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("oldalloc", Opt_removed),
fsparam_flag ("orlov", Opt_removed),
fsparam_flag ("user_xattr", Opt_user_xattr),
- fsparam_flag ("nouser_xattr", Opt_nouser_xattr),
fsparam_flag ("acl", Opt_acl),
- fsparam_flag ("noacl", Opt_noacl),
fsparam_flag ("norecovery", Opt_noload),
fsparam_flag ("noload", Opt_noload),
fsparam_flag ("bh", Opt_removed),
@@ -1694,7 +1686,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("barrier", Opt_barrier),
fsparam_u32 ("barrier", Opt_barrier),
fsparam_flag ("nobarrier", Opt_nobarrier),
- fsparam_flag ("i_version", Opt_i_version),
+ fsparam_flag ("i_version", Opt_removed),
fsparam_flag ("dax", Opt_dax),
fsparam_enum ("dax", Opt_dax_type, ext4_param_dax),
fsparam_u32 ("stripe", Opt_stripe),
@@ -1814,13 +1806,10 @@ static const struct mount_opts {
{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
{Opt_data, 0, MOPT_NO_EXT2},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
- {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
- {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
{Opt_acl, 0, MOPT_NOSUPPORT},
- {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
@@ -2120,10 +2109,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
else
return note_qf_name(fc, GRPQUOTA, param);
#endif
- case Opt_noacl:
- case Opt_nouser_xattr:
- ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5");
- break;
case Opt_sb:
if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
ext4_msg(NULL, KERN_WARNING,
@@ -2140,11 +2125,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_abort:
ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
return 0;
- case Opt_i_version:
- ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
- ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n");
- ctx_set_flags(ctx, SB_I_VERSION);
- return 0;
case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
ctx_set_flags(ctx, SB_INLINECRYPT);
@@ -2814,14 +2794,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
sb->s_flags &= ~ctx->mask_s_flags;
sb->s_flags |= ctx->vals_s_flags;
- /*
- * i_version differs from common mount option iversion so we have
- * to let vfs know that it was set, otherwise it would get cleared
- * on remount
- */
- if (ctx->mask_s_flags & SB_I_VERSION)
- fc->sb_flags |= SB_I_VERSION;
-
#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
APPLY(s_commit_interval);
APPLY(s_stripe);
@@ -2970,8 +2942,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
- if (sb->s_flags & SB_I_VERSION)
- SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -3767,6 +3737,7 @@ static int ext4_lazyinit_thread(void *arg)
unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
+ set_freezable();
cont_thread:
while (true) {
@@ -3811,8 +3782,7 @@ cont_thread:
}
if (!progress) {
elr->lr_next_sched = jiffies +
- (prandom_u32()
- % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
if (time_before(elr->lr_next_sched, next_wakeup))
next_wakeup = elr->lr_next_sched;
@@ -3959,8 +3929,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
* spread the inode table initialization requests
* better.
*/
- elr->lr_next_sched = jiffies + (prandom_u32() %
- (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ elr->lr_next_sched = jiffies + prandom_u32_max(
+ EXT4_DEF_LI_MAX_START_DELAY * HZ);
return elr;
}
@@ -3982,9 +3952,9 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
- (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE)))
+ if (sb_rdonly(sb) ||
+ (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -4311,112 +4281,10 @@ err_out:
return NULL;
}
-static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+static void ext4_set_def_opts(struct super_block *sb,
+ struct ext4_super_block *es)
{
- struct buffer_head *bh, **group_desc;
- struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct flex_groups **flex_groups;
- ext4_fsblk_t block;
- ext4_fsblk_t logical_sb_block;
- unsigned long offset = 0;
unsigned long def_mount_opts;
- struct inode *root;
- int ret = -ENOMEM;
- int blocksize, clustersize;
- unsigned int db_count;
- unsigned int i;
- int needs_recovery, has_huge_files;
- __u64 blocks_count;
- int err = 0;
- ext4_group_t first_not_zeroed;
- struct ext4_fs_context *ctx = fc->fs_private;
- int silent = fc->sb_flags & SB_SILENT;
-
- /* Set defaults for the variables that will be set during parsing */
- if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-
- sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
- sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
-
- /* -EINVAL is default */
- ret = -EINVAL;
- blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
- if (!blocksize) {
- ext4_msg(sb, KERN_ERR, "unable to set blocksize");
- goto out_fail;
- }
-
- /*
- * The ext4 superblock will not be buffer aligned for other than 1kB
- * block sizes. We need to calculate the offset from buffer start.
- */
- if (blocksize != EXT4_MIN_BLOCK_SIZE) {
- logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- } else {
- logical_sb_block = sbi->s_sb_block;
- }
-
- bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR, "unable to read superblock");
- ret = PTR_ERR(bh);
- goto out_fail;
- }
- /*
- * Note: s_es must be initialized as soon as possible because
- * some ext4 macro-instructions depend on its value
- */
- es = (struct ext4_super_block *) (bh->b_data + offset);
- sbi->s_es = es;
- sb->s_magic = le16_to_cpu(es->s_magic);
- if (sb->s_magic != EXT4_SUPER_MAGIC)
- goto cantfind_ext4;
- sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
-
- /* Warn if metadata_csum and gdt_csum are both set. */
- if (ext4_has_feature_metadata_csum(sb) &&
- ext4_has_feature_gdt_csum(sb))
- ext4_warning(sb, "metadata_csum and uninit_bg are "
- "redundant flags; please run fsck.");
-
- /* Check for a known checksum algorithm */
- if (!ext4_verify_csum_type(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "unknown checksum algorithm.");
- silent = 1;
- goto cantfind_ext4;
- }
- ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
- ext4_orphan_file_block_trigger);
-
- /* Load the checksum driver */
- sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(sbi->s_chksum_driver)) {
- ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
- ret = PTR_ERR(sbi->s_chksum_driver);
- sbi->s_chksum_driver = NULL;
- goto failed_mount;
- }
-
- /* Check superblock checksum */
- if (!ext4_superblock_csum_verify(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "invalid superblock checksum. Run e2fsck?");
- silent = 1;
- ret = -EFSBADCRC;
- goto cantfind_ext4;
- }
-
- /* Precompute checksum seed for all metadata */
- if (ext4_has_feature_csum_seed(sb))
- sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
- sizeof(es->s_uuid));
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -4445,9 +4313,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
set_opt(sb, WRITEBACK_DATA);
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+ if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
set_opt(sb, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+ else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
set_opt(sb, ERRORS_CONT);
else
set_opt(sb, ERRORS_RO);
@@ -4456,12 +4324,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
- sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
- sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
- sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-
if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
set_opt(sb, BARRIER);
@@ -4473,31 +4335,96 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- /*
- * set default s_li_wait_mult for lazyinit, for the case there is
- * no mount option specified.
- */
- sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (sb->s_blocksize == PAGE_SIZE)
+ set_opt(sb, DIOREAD_NOLOCK);
+}
- if (le32_to_cpu(es->s_log_block_size) >
- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log block size: %u",
- le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
- if (le32_to_cpu(es->s_log_cluster_size) >
- (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log cluster size: %u",
- le32_to_cpu(es->s_log_cluster_size));
- goto failed_mount;
+static int ext4_handle_clustersize(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int clustersize;
+
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ if (ext4_has_feature_bigalloc(sb)) {
+ if (clustersize < sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ } else {
+ if (clustersize != sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "fragment/cluster size (%d) != "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ return -EINVAL;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
}
+ sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
- blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
- if (blocksize == PAGE_SIZE)
- set_opt(sb, DIOREAD_NOLOCK);
+ return 0;
+}
+
+static void ext4_fast_commit_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Initialize fast commit stuff */
+ atomic_set(&sbi->s_fc_subtid, 0);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ sbi->s_fc_bytes = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ sbi->s_fc_ineligible_tid = 0;
+ spin_lock_init(&sbi->s_fc_lock);
+ memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ sbi->s_fc_replay_state.fc_regions = NULL;
+ sbi->s_fc_replay_state.fc_regions_size = 0;
+ sbi->s_fc_replay_state.fc_regions_used = 0;
+ sbi->s_fc_replay_state.fc_regions_valid = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+ sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+}
+
+static int ext4_inode_info_init(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -4508,16 +4435,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
sbi->s_first_ino);
- goto failed_mount;
+ return -EINVAL;
}
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
+ (sbi->s_inode_size > sb->s_blocksize)) {
ext4_msg(sb, KERN_ERR,
"unsupported inode size: %d",
sbi->s_inode_size);
- ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
- goto failed_mount;
+ ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
+ return -EINVAL;
}
/*
* i_atime_extra is the last extra field available for
@@ -4535,6 +4462,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
sb->s_time_min = EXT4_TIMESTAMP_MIN;
}
+
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
@@ -4546,7 +4474,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_want_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
@@ -4555,91 +4483,112 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_min_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
}
}
- err = parse_apply_sb_mount_options(sb, ctx);
- if (err < 0)
- goto failed_mount;
+ return 0;
+}
- sbi->s_def_mount_opt = sbi->s_mount_opt;
+#if IS_ENABLED(CONFIG_UNICODE)
+static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ const struct ext4_sb_encodings *encoding_info;
+ struct unicode_map *encoding;
+ __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
- err = ext4_check_opt_consistency(fc, sb);
- if (err < 0)
- goto failed_mount;
+ if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
+ return 0;
- ext4_apply_options(fc, sb);
+ encoding_info = ext4_sb_read_encoding(es);
+ if (!encoding_info) {
+ ext4_msg(sb, KERN_ERR,
+ "Encoding requested by superblock is unknown");
+ return -EINVAL;
+ }
-#if IS_ENABLED(CONFIG_UNICODE)
- if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
- const struct ext4_sb_encodings *encoding_info;
- struct unicode_map *encoding;
- __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
+ encoding = utf8_load(encoding_info->version);
+ if (IS_ERR(encoding)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with superblock charset: %s-%u.%u.%u "
+ "not supported by the kernel. flags: 0x%x.",
+ encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
+ return -EINVAL;
+ }
+ ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
+ "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
- encoding_info = ext4_sb_read_encoding(es);
- if (!encoding_info) {
- ext4_msg(sb, KERN_ERR,
- "Encoding requested by superblock is unknown");
- goto failed_mount;
- }
+ sb->s_encoding = encoding;
+ sb->s_encoding_flags = encoding_flags;
- encoding = utf8_load(encoding_info->version);
- if (IS_ERR(encoding)) {
- ext4_msg(sb, KERN_ERR,
- "can't mount with superblock charset: %s-%u.%u.%u "
- "not supported by the kernel. flags: 0x%x.",
- encoding_info->name,
- unicode_major(encoding_info->version),
- unicode_minor(encoding_info->version),
- unicode_rev(encoding_info->version),
- encoding_flags);
- goto failed_mount;
- }
- ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
- "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
- unicode_major(encoding_info->version),
- unicode_minor(encoding_info->version),
- unicode_rev(encoding_info->version),
- encoding_flags);
+ return 0;
+}
+#else
+static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ return 0;
+}
+#endif
- sb->s_encoding = encoding;
- sb->s_encoding_flags = encoding_flags;
+static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
+ ext4_warning(sb, "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
+
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ return -EINVAL;
}
-#endif
+ ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
+ ext4_orphan_file_block_trigger);
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
- /* can't mount with both data=journal and dioread_nolock. */
- clear_opt(sb, DIOREAD_NOLOCK);
- clear_opt2(sb, JOURNAL_FAST_COMMIT);
- if (test_opt2(sb, EXPLICIT_DELALLOC)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and delalloc");
- goto failed_mount;
- }
- if (test_opt(sb, DAX_ALWAYS)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- goto failed_mount;
- }
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_WARNING,
- "encrypted files will use data=ordered "
- "instead of data journaling mode");
- }
- if (test_opt(sb, DELALLOC))
- clear_opt(sb, DELALLOC);
- } else {
- sb->s_iflags |= SB_I_CGROUPWB;
+ /* Load the checksum driver */
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ int ret = PTR_ERR(sbi->s_chksum_driver);
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ sbi->s_chksum_driver = NULL;
+ return ret;
}
- sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ return -EFSBADCRC;
+ }
+ /* Precompute checksum seed for all metadata */
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
+ sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sizeof(es->s_uuid));
+ return 0;
+}
+
+static int ext4_check_feature_compatibility(struct super_block *sb,
+ struct ext4_super_block *es,
+ int silent)
+{
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(ext4_has_compat_features(sb) ||
ext4_has_ro_compat_features(sb) ||
@@ -4653,7 +4602,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
- goto failed_mount;
+ return -EINVAL;
}
/*
@@ -4663,7 +4612,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (ext4_has_feature_ea_inode(sb)) {
ext4_msg(sb, KERN_ERR,
"ea_inode feature is not supported for Hurd");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4677,10 +4626,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* it's actually an ext[34] filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4694,10 +4643,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* it's actually an ext4 filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4707,9 +4656,463 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ext4_geometry_check(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ __u64 blocks_count;
+
+ /* check blocks count against device size */
+ blocks_count = sb_bdev_nr_blocks(sb);
+ if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
+ ext4_blocks_count(es), blocks_count);
+ return -EINVAL;
+ }
+
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block %u is beyond end of filesystem (%llu)",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
+ return -EINVAL;
+ }
+ if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
+ (sbi->s_cluster_ratio == 1)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block is 0 with a 1k block and cluster size");
+ return -EINVAL;
+ }
+
+ blocks_count = (ext4_blocks_count(es) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
+ "(block count %llu, first data block %u, "
+ "blocks per group %lu)", blocks_count,
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
+ return -EINVAL;
+ }
+ sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+ if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
+ le32_to_cpu(es->s_inodes_count),
+ ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void ext4_group_desc_free(struct ext4_sb_info *sbi)
+{
+ struct buffer_head **group_desc;
+ int i;
+
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
+ for (i = 0; i < sbi->s_gdb_count; i++)
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
+}
+
+static int ext4_group_desc_init(struct super_block *sb,
+ struct ext4_super_block *es,
+ ext4_fsblk_t logical_sb_block,
+ ext4_group_t *first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned int db_count;
+ ext4_fsblk_t block;
+ int ret;
+ int i;
+
+ db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ if (ext4_has_feature_meta_bg(sb)) {
+ if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
+ ext4_msg(sb, KERN_WARNING,
+ "first meta block group too large: %u "
+ "(group descriptor block count %u)",
+ le32_to_cpu(es->s_first_meta_bg), db_count);
+ return -EINVAL;
+ }
+ }
+ rcu_assign_pointer(sbi->s_group_desc,
+ kvmalloc_array(db_count,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL));
+ if (sbi->s_group_desc == NULL) {
+ ext4_msg(sb, KERN_ERR, "not enough memory");
+ return -ENOMEM;
+ }
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ /* Pre-read the descriptors into the buffer cache */
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ ext4_sb_breadahead_unmovable(sb, block);
+ }
+
+ for (i = 0; i < db_count; i++) {
+ struct buffer_head *bh;
+
+ block = descriptor_loc(sb, logical_sb_block, i);
+ bh = ext4_sb_bread_unmovable(sb, block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't read group descriptor %d", i);
+ sbi->s_gdb_count = i;
+ ret = PTR_ERR(bh);
+ goto out;
+ }
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_desc)[i] = bh;
+ rcu_read_unlock();
+ }
+ sbi->s_gdb_count = db_count;
+ if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
+ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+ return 0;
+out:
+ ext4_group_desc_free(sbi);
+ return ret;
+}
+
+static int ext4_load_and_init_journal(struct super_block *sb,
+ struct ext4_super_block *es,
+ struct ext4_fs_context *ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ if (err)
+ return err;
+
+ if (ext4_has_feature_64bit(sb) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+ goto out;
+ }
+
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto out;
+ }
+
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to set fast commit journal feature");
+ goto out;
+ }
+
+ /* We have now updated the journal if required, so we can
+ * validate the data journaling mode. */
+ switch (test_opt(sb, DATA_FLAGS)) {
+ case 0:
+ /* No mode set, assume a default based on the journal
+ * capabilities: ORDERED_DATA if the journal can
+ * cope, else JOURNAL_DATA
+ */
+ if (jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ set_opt(sb, ORDERED_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ } else {
+ set_opt(sb, JOURNAL_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ }
+ break;
+
+ case EXT4_MOUNT_ORDERED_DATA:
+ case EXT4_MOUNT_WRITEBACK_DATA:
+ if (!jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ goto out;
+ }
+
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
+
+ sbi->s_journal->j_submit_inode_data_buffers =
+ ext4_journal_submit_inode_data_buffers;
+ sbi->s_journal->j_finish_inode_data_buffers =
+ ext4_journal_finish_inode_data_buffers;
+
+ return 0;
+
+out:
+ /* flush s_error_work before journal destroy. */
+ flush_work(&sbi->s_error_work);
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ return err;
+}
+
+static int ext4_journal_data_mode_check(struct super_block *sb)
+{
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
+ "data=journal disables delayed allocation, "
+ "dioread_nolock, O_DIRECT and fast_commit support!\n");
+ /* can't mount with both data=journal and dioread_nolock. */
+ clear_opt(sb, DIOREAD_NOLOCK);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ return -EINVAL;
+ }
+ if (test_opt(sb, DAX_ALWAYS)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -EINVAL;
+ }
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "encrypted files will use data=ordered "
+ "instead of data journaling mode");
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ } else {
+ sb->s_iflags |= SB_I_CGROUPWB;
+ }
+
+ return 0;
+}
+
+static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
+ int silent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es;
+ ext4_fsblk_t logical_sb_block;
+ unsigned long offset = 0;
+ struct buffer_head *bh;
+ int ret = -EINVAL;
+ int blocksize;
+
+ blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+ if (!blocksize) {
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+ return -EINVAL;
+ }
+
+ /*
+ * The ext4 superblock will not be buffer aligned for other than 1kB
+ * block sizes. We need to calculate the offset from buffer start.
+ */
+ if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ } else {
+ logical_sb_block = sbi->s_sb_block;
+ }
+
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ return PTR_ERR(bh);
+ }
+ /*
+ * Note: s_es must be initialized as soon as possible because
+ * some ext4 macro-instructions depend on its value
+ */
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+ if (sb->s_magic != EXT4_SUPER_MAGIC) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto out;
+ }
+
+ if (le32_to_cpu(es->s_log_block_size) >
+ (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log block size: %u",
+ le32_to_cpu(es->s_log_block_size));
+ goto out;
+ }
+ if (le32_to_cpu(es->s_log_cluster_size) >
+ (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log cluster size: %u",
+ le32_to_cpu(es->s_log_cluster_size));
+ goto out;
+ }
+
+ blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ /*
+ * If the default block size is not the same as the real block size,
+ * we need to reload it.
+ */
+ if (sb->s_blocksize == blocksize) {
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+ }
+
+ /*
+ * bh must be released before kill_bdev(), otherwise
+ * it won't be freed and its page also. kill_bdev()
+ * is called by sb_set_blocksize().
+ */
+ brelse(bh);
+ /* Validate the filesystem blocksize */
+ if (!sb_set_blocksize(sb, blocksize)) {
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
+ blocksize);
+ bh = NULL;
+ goto out;
+ }
+
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
+ ret = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
+ es = (struct ext4_super_block *)(bh->b_data + offset);
+ sbi->s_es = es;
+ if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+ ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
+ goto out;
+ }
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+out:
+ brelse(bh);
+ return ret;
+}
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+{
+ struct ext4_super_block *es = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct flex_groups **flex_groups;
+ ext4_fsblk_t block;
+ ext4_fsblk_t logical_sb_block;
+ struct inode *root;
+ int ret = -ENOMEM;
+ unsigned int i;
+ int needs_recovery, has_huge_files;
+ int err = 0;
+ ext4_group_t first_not_zeroed;
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
+
+ /* Set defaults for the variables that will be set during parsing */
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
+
+ /* -EINVAL is default */
+ ret = -EINVAL;
+ err = ext4_load_super(sb, &logical_sb_block, silent);
+ if (err)
+ goto out_fail;
+
+ es = sbi->s_es;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+ err = ext4_init_metadata_csum(sb, es);
+ if (err)
+ goto failed_mount;
+
+ ext4_set_def_opts(sb, es);
+
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+ if (ext4_inode_info_init(sb, es))
+ goto failed_mount;
+
+ err = parse_apply_sb_mount_options(sb, ctx);
+ if (err < 0)
+ goto failed_mount;
+
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+
+ err = ext4_check_opt_consistency(fc, sb);
+ if (err < 0)
goto failed_mount;
- if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
+ ext4_apply_options(fc, sb);
+
+ if (ext4_encoding_init(sb, es))
+ goto failed_mount;
+
+ if (ext4_journal_data_mode_check(sb))
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+
+ /* i_version is always enabled now */
+ sb->s_flags |= SB_I_VERSION;
+
+ if (ext4_check_feature_compatibility(sb, es, silent))
+ goto failed_mount;
+
+ if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
ext4_msg(sb, KERN_ERR,
"Number of reserved GDT blocks insanely large: %d",
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
@@ -4717,7 +5120,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
if (sbi->s_daxdev) {
- if (blocksize == PAGE_SIZE)
+ if (sb->s_blocksize == PAGE_SIZE)
set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
else
ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
@@ -4742,40 +5145,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount;
}
- if (sb->s_blocksize != blocksize) {
- /*
- * bh must be released before kill_bdev(), otherwise
- * it won't be freed and its page also. kill_bdev()
- * is called by sb_set_blocksize().
- */
- brelse(bh);
- /* Validate the filesystem blocksize */
- if (!sb_set_blocksize(sb, blocksize)) {
- ext4_msg(sb, KERN_ERR, "bad block size %d",
- blocksize);
- bh = NULL;
- goto failed_mount;
- }
-
- logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR,
- "Can't read superblock on 2nd try");
- ret = PTR_ERR(bh);
- bh = NULL;
- goto failed_mount;
- }
- es = (struct ext4_super_block *)(bh->b_data + offset);
- sbi->s_es = es;
- if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- ext4_msg(sb, KERN_ERR,
- "Magic mismatch, very weird!");
- goto failed_mount;
- }
- }
-
has_huge_files = ext4_has_feature_huge_file(sb);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
@@ -4797,19 +5166,21 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
- if (sbi->s_inodes_per_block == 0)
- goto cantfind_ext4;
+ sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
+ if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto failed_mount;
+ }
if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
- sbi->s_inodes_per_group > blocksize * 8) {
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
sbi->s_inodes_per_group);
goto failed_mount;
}
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
- sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
- sbi->s_sbh = bh;
+ sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
@@ -4835,54 +5206,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
}
- /* Handle clustersize */
- clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- if (ext4_has_feature_bigalloc(sb)) {
- if (clustersize < blocksize) {
- ext4_msg(sb, KERN_ERR,
- "cluster size (%d) smaller than "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
- le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- } else {
- if (clustersize != blocksize) {
- ext4_msg(sb, KERN_ERR,
- "fragment/cluster size (%d) != "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
- }
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
- sbi->s_cluster_bits = 0;
- }
- sbi->s_cluster_ratio = clustersize / blocksize;
-
- /* Do we have standard group size of clustersize * 8 blocks ? */
- if (sbi->s_blocks_per_group == clustersize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
+ if (ext4_handle_clustersize(sb))
+ goto failed_mount;
/*
* Test whether we have more sectors than will fit in sector_t,
@@ -4896,111 +5221,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount;
}
- if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
-
- /* check blocks count against device size */
- blocks_count = sb_bdev_nr_blocks(sb);
- if (blocks_count && ext4_blocks_count(es) > blocks_count) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
- "exceeds size of device (%llu blocks)",
- ext4_blocks_count(es), blocks_count);
+ if (ext4_geometry_check(sb, es))
goto failed_mount;
- }
- /*
- * It makes no sense for the first data block to be beyond the end
- * of the filesystem.
- */
- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block %u is beyond end of filesystem (%llu)",
- le32_to_cpu(es->s_first_data_block),
- ext4_blocks_count(es));
- goto failed_mount;
- }
- if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
- (sbi->s_cluster_ratio == 1)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block is 0 with a 1k block and cluster size");
- goto failed_mount;
- }
-
- blocks_count = (ext4_blocks_count(es) -
- le32_to_cpu(es->s_first_data_block) +
- EXT4_BLOCKS_PER_GROUP(sb) - 1);
- do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
- if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
- "(block count %llu, first data block %u, "
- "blocks per group %lu)", blocks_count,
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
- goto failed_mount;
- }
- sbi->s_groups_count = blocks_count;
- sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
- (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
- if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
- le32_to_cpu(es->s_inodes_count)) {
- ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
- le32_to_cpu(es->s_inodes_count),
- ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
- ret = -EINVAL;
- goto failed_mount;
- }
- db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
- EXT4_DESC_PER_BLOCK(sb);
- if (ext4_has_feature_meta_bg(sb)) {
- if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
- ext4_msg(sb, KERN_WARNING,
- "first meta block group too large: %u "
- "(group descriptor block count %u)",
- le32_to_cpu(es->s_first_meta_bg), db_count);
- goto failed_mount;
- }
- }
- rcu_assign_pointer(sbi->s_group_desc,
- kvmalloc_array(db_count,
- sizeof(struct buffer_head *),
- GFP_KERNEL));
- if (sbi->s_group_desc == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory");
- ret = -ENOMEM;
+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
+ if (err)
goto failed_mount;
- }
-
- bgl_lock_init(sbi->s_blockgroup_lock);
-
- /* Pre-read the descriptors into the buffer cache */
- for (i = 0; i < db_count; i++) {
- block = descriptor_loc(sb, logical_sb_block, i);
- ext4_sb_breadahead_unmovable(sb, block);
- }
-
- for (i = 0; i < db_count; i++) {
- struct buffer_head *bh;
-
- block = descriptor_loc(sb, logical_sb_block, i);
- bh = ext4_sb_bread_unmovable(sb, block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR,
- "can't read group descriptor %d", i);
- db_count = i;
- ret = PTR_ERR(bh);
- goto failed_mount2;
- }
- rcu_read_lock();
- rcu_dereference(sbi->s_group_desc)[i] = bh;
- rcu_read_unlock();
- }
- sbi->s_gdb_count = db_count;
- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
- ret = -EFSCORRUPTED;
- goto failed_mount2;
- }
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
spin_lock_init(&sbi->s_error_lock);
@@ -5038,24 +5264,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
- /* Initialize fast commit stuff */
- atomic_set(&sbi->s_fc_subtid, 0);
- INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
- INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
- INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
- INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
- sbi->s_fc_bytes = 0;
- ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
- memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
- sbi->s_fc_replay_state.fc_regions = NULL;
- sbi->s_fc_replay_state.fc_regions_size = 0;
- sbi->s_fc_replay_state.fc_regions_used = 0;
- sbi->s_fc_replay_state.fc_regions_valid = 0;
- sbi->s_fc_replay_state.fc_modified_inodes = NULL;
- sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
- sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+ ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -5072,37 +5281,37 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
- err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
- goto failed_mount_wq;
+ goto failed_mount3a;
} else {
/* Nojournal mode, all journal mount options are illegal */
if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_checksum, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_async_commit, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"commit=%lu, fs mounted w/o journal",
sbi->s_commit_interval / HZ);
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"data=, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
@@ -5110,76 +5319,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
clear_opt2(sb, JOURNAL_FAST_COMMIT);
sbi->s_journal = NULL;
needs_recovery = 0;
- goto no_journal;
- }
-
- if (ext4_has_feature_64bit(sb) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_64BIT)) {
- ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
- goto failed_mount_wq;
- }
-
- if (!set_journal_csum_feature_set(sb)) {
- ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
- "feature set");
- goto failed_mount_wq;
- }
-
- if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
- ext4_msg(sb, KERN_ERR,
- "Failed to set fast commit journal feature");
- goto failed_mount_wq;
- }
-
- /* We have now updated the journal if required, so we can
- * validate the data journaling mode. */
- switch (test_opt(sb, DATA_FLAGS)) {
- case 0:
- /* No mode set, assume a default based on the journal
- * capabilities: ORDERED_DATA if the journal can
- * cope, else JOURNAL_DATA
- */
- if (jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- set_opt(sb, ORDERED_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
- } else {
- set_opt(sb, JOURNAL_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
- }
- break;
-
- case EXT4_MOUNT_ORDERED_DATA:
- case EXT4_MOUNT_WRITEBACK_DATA:
- if (!jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- ext4_msg(sb, KERN_ERR, "Journal does not support "
- "requested data journaling mode");
- goto failed_mount_wq;
- }
- break;
- default:
- break;
- }
-
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
- test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit in data=ordered mode");
- goto failed_mount_wq;
}
- set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
-
- sbi->s_journal->j_submit_inode_data_buffers =
- ext4_journal_submit_inode_data_buffers;
- sbi->s_journal->j_finish_inode_data_buffers =
- ext4_journal_finish_inode_data_buffers;
-
-no_journal:
if (!test_opt(sb, NO_MBCACHE)) {
sbi->s_ea_block_cache = ext4_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
@@ -5198,7 +5339,7 @@ no_journal:
}
}
- if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
}
@@ -5408,11 +5549,6 @@ no_journal:
return 0;
-cantfind_ext4:
- if (!silent)
- ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
- goto failed_mount;
-
failed_mount9:
ext4_release_orphan_info(sb);
failed_mount8:
@@ -5466,13 +5602,7 @@ failed_mount3:
flush_work(&sbi->s_error_work);
del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi);
-failed_mount2:
- rcu_read_lock();
- group_desc = rcu_dereference(sbi->s_group_desc);
- for (i = 0; i < db_count; i++)
- brelse(group_desc[i]);
- kvfree(group_desc);
- rcu_read_unlock();
+ ext4_group_desc_free(sbi);
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
@@ -5487,7 +5617,7 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
- brelse(bh);
+ brelse(sbi->s_sbh);
ext4_blkdev_remove(sbi);
out_fail:
sb->s_fs_info = NULL;
@@ -6653,7 +6783,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index b051d19b5c8a..3c640bd7ecae 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -298,16 +298,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
last_extent = path[path->p_depth].p_ext;
if (!last_extent) {
EXT4_ERROR_INODE(inode, "verity file has no extents");
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return -EFSCORRUPTED;
}
end_lblk = le32_to_cpu(last_extent->ee_block) +
ext4_ext_get_actual_len(last_extent);
desc_size_pos = (u64)end_lblk << inode->i_blkbits;
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (desc_size_pos < sizeof(desc_size_disk))
goto bad;
@@ -365,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 533216e80fa2..36d6ba7190b6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2412,6 +2412,7 @@ retry_inode:
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
if (!value)
no_expand = 0;
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index eaa240b21f07..5bbc44a5216e 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -219,7 +219,7 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 8259e0fa97e1..0c82dae082aa 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -26,12 +26,16 @@
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;
-void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
+ unsigned char reason)
{
f2fs_build_fault_attr(sbi, 0, 0);
set_ckpt_flags(sbi, CP_ERROR_FLAG);
- if (!end_io)
+ if (!end_io) {
f2fs_flush_merged_writes(sbi);
+
+ f2fs_handle_stop(sbi, reason);
+ }
}
/*
@@ -89,7 +93,7 @@ repeat:
return ERR_PTR(err);
}
- f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE);
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -122,7 +126,7 @@ retry:
if (PTR_ERR(page) == -EIO &&
++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE);
}
return page;
}
@@ -140,7 +144,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
unsigned int segno, offset;
bool exist;
- if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ)
+ if (type == DATA_GENERIC)
return true;
segno = GET_SEGNO(sbi, blkaddr);
@@ -148,6 +152,13 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
se = get_seg_entry(sbi, segno);
exist = f2fs_test_bit(offset, se->cur_valid_map);
+ if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) {
+ f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+ blkaddr, exist);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ return exist;
+ }
+
if (!exist && type == DATA_GENERIC_ENHANCE) {
f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
blkaddr, exist);
@@ -185,6 +196,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
case DATA_GENERIC_ENHANCE_READ:
+ case DATA_GENERIC_ENHANCE_UPDATE:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi))) {
f2fs_warn(sbi, "access invalid blkaddr:%u",
@@ -276,7 +288,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
f2fs_put_page(page, err ? 1 : 0);
if (!err)
- f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_META_READ_IO,
+ F2FS_BLKSIZE);
}
out:
blk_finish_plug(&plug);
@@ -448,8 +461,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping,
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
- if (!folio_test_dirty(folio)) {
- filemap_dirty_folio(mapping, folio);
+ if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
set_page_private_reference(&folio->page);
return true;
@@ -1053,7 +1065,8 @@ void f2fs_remove_dirty_inode(struct inode *inode)
spin_unlock(&sbi->inode_lock[type]);
}
-int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
+ bool from_cp)
{
struct list_head *head;
struct inode *inode;
@@ -1088,11 +1101,15 @@ retry:
if (inode) {
unsigned long cur_ino = inode->i_ino;
- F2FS_I(inode)->cp_task = current;
+ if (from_cp)
+ F2FS_I(inode)->cp_task = current;
+ F2FS_I(inode)->wb_task = current;
filemap_fdatawrite(inode->i_mapping);
- F2FS_I(inode)->cp_task = NULL;
+ F2FS_I(inode)->wb_task = NULL;
+ if (from_cp)
+ F2FS_I(inode)->cp_task = NULL;
iput(inode);
/* We need to give cpu to another writers. */
@@ -1221,7 +1238,7 @@ retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- err = f2fs_sync_dirty_inodes(sbi, DIR_INODE);
+ err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true);
if (err)
return err;
cond_resched();
@@ -1892,15 +1909,27 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
{
struct ckpt_req_control *cprc = &sbi->cprc_info;
+ struct task_struct *ckpt_task;
+
+ if (!cprc->f2fs_issue_ckpt)
+ return;
- if (cprc->f2fs_issue_ckpt) {
- struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt;
+ ckpt_task = cprc->f2fs_issue_ckpt;
+ cprc->f2fs_issue_ckpt = NULL;
+ kthread_stop(ckpt_task);
- cprc->f2fs_issue_ckpt = NULL;
- kthread_stop(ckpt_task);
+ f2fs_flush_ckpt_thread(sbi);
+}
- flush_remained_ckpt_reqs(sbi, NULL);
- }
+void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ flush_remained_ckpt_reqs(sbi, NULL);
+
+ /* Let's wait for the previous dispatched checkpoint. */
+ while (atomic_read(&cprc->queued_ckpt))
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 70e97075e535..d315c2de136f 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -762,6 +762,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION);
goto out_release;
}
@@ -912,17 +913,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
reason = "[C|*|C|*]";
goto out;
}
- if (compressed) {
- if (!__is_valid_data_blkaddr(blkaddr)) {
- if (!cluster_end)
- cluster_end = i;
- continue;
- }
- /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */
- if (cluster_end) {
- reason = "[C|N|N|V]";
- goto out;
- }
+ if (!__is_valid_data_blkaddr(blkaddr)) {
+ if (!cluster_end)
+ cluster_end = i;
+ continue;
+ }
+ /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */
+ if (cluster_end) {
+ reason = "[C|N|N|V]";
+ goto out;
}
}
return false;
@@ -952,6 +951,7 @@ static int __f2fs_cluster_blocks(struct inode *inode,
if (f2fs_sanity_check_cluster(&dn)) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER);
goto fail;
}
@@ -1568,12 +1568,8 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
if (!dic->cbuf)
return -ENOMEM;
- if (cops->init_decompress_ctx) {
- int ret = cops->init_decompress_ctx(dic);
-
- if (ret)
- return ret;
- }
+ if (cops->init_decompress_ctx)
+ return cops->init_decompress_ctx(dic);
return 0;
}
@@ -1905,7 +1901,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct address_space *mapping = sbi->compress_inode->i_mapping;
+ struct address_space *mapping = COMPRESS_MAPPING(sbi);
struct folio_batch fbatch;
pgoff_t index = 0;
pgoff_t end = MAX_BLKADDR(sbi);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 93cc2ec51c2a..a71e818cd67b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -335,7 +335,8 @@ static void f2fs_write_end_io(struct bio *bio)
mempool_free(page, sbi->write_io_dummy);
if (unlikely(bio->bi_status))
- f2fs_stop_checkpoint(sbi, true);
+ f2fs_stop_checkpoint(sbi, true,
+ STOP_CP_REASON_WRITE_FAIL);
continue;
}
@@ -351,7 +352,8 @@ static void f2fs_write_end_io(struct bio *bio)
if (unlikely(bio->bi_status)) {
mapping_set_error(page->mapping, -EIO);
if (type == F2FS_WB_CP_DATA)
- f2fs_stop_checkpoint(sbi, true);
+ f2fs_stop_checkpoint(sbi, true,
+ STOP_CP_REASON_WRITE_FAIL);
}
f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
@@ -705,8 +707,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
fio->is_por ? META_POR : (__is_meta_io(fio) ?
- META_GENERIC : DATA_GENERIC_ENHANCE)))
+ META_GENERIC : DATA_GENERIC_ENHANCE))) {
+ f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
trace_f2fs_submit_page_bio(page, fio);
@@ -725,7 +729,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
- __read_io_type(page): WB_DATA_TYPE(fio->page));
+ __read_io_type(page) : WB_DATA_TYPE(fio->page));
__submit_bio(fio->sbi, bio, fio->type);
return 0;
@@ -906,8 +910,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
fio->encrypted_page : fio->page;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
- __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
+ __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) {
+ f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
trace_f2fs_submit_page_bio(page, fio);
@@ -1085,7 +1091,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
}
ClearPageError(page);
inc_page_count(sbi, F2FS_RD_DATA);
- f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE);
__submit_bio(sbi, bio, DATA);
return 0;
}
@@ -1217,6 +1223,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_INVALID_BLKADDR);
goto put_err;
}
goto got_it;
@@ -1237,6 +1245,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
dn.data_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_INVALID_BLKADDR);
goto put_err;
}
got_it:
@@ -1550,6 +1560,7 @@ next_block:
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto sync_out;
}
@@ -1595,6 +1606,8 @@ next_block:
(flag != F2FS_GET_BLOCK_FIEMAP ||
IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_CORRUPTED_CLUSTER);
goto sync_out;
}
if (flag == F2FS_GET_BLOCK_BMAP) {
@@ -1818,7 +1831,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
trace_f2fs_fiemap(inode, 0, phys, len, flags, err);
- if (err || err == 1)
+ if (err)
return err;
}
@@ -2076,6 +2089,8 @@ got_it:
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
DATA_GENERIC_ENHANCE_READ)) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_INVALID_BLKADDR);
goto out;
}
} else {
@@ -2124,7 +2139,8 @@ submit_and_realloc:
goto submit_and_realloc;
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
- f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
ClearPageError(page);
*last_block_in_bio = block_nr;
goto out;
@@ -2272,8 +2288,7 @@ submit_and_realloc:
refcount_inc(&dic->refcnt);
inc_page_count(sbi, F2FS_RD_DATA);
- f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
- f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
ClearPageError(page);
*last_block_in_bio = blkaddr;
}
@@ -2545,7 +2560,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
/* if this is cold file, we should overwrite to avoid fragmentation */
- if (file_is_cold(inode))
+ if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE))
return true;
return check_inplace_update_policy(inode, fio);
@@ -2619,8 +2634,11 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
- DATA_GENERIC_ENHANCE))
+ DATA_GENERIC_ENHANCE)) {
+ f2fs_handle_error(fio->sbi,
+ ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
ipu_force = true;
fio->need_lock = LOCK_DONE;
@@ -2648,6 +2666,7 @@ got_it:
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
goto out_writepage;
}
@@ -2858,7 +2877,7 @@ out:
}
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
- !F2FS_I(inode)->cp_task && allow_balance)
+ !F2FS_I(inode)->wb_task && allow_balance)
f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
@@ -3158,7 +3177,7 @@ static inline bool __should_serialize_io(struct inode *inode,
struct writeback_control *wbc)
{
/* to avoid deadlock in path of data flush */
- if (F2FS_I(inode)->cp_task)
+ if (F2FS_I(inode)->wb_task)
return false;
if (!S_ISREG(inode->i_mode))
@@ -3561,6 +3580,7 @@ repeat:
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
}
err = f2fs_submit_page_read(inode, page, blkaddr, 0, true);
@@ -3699,8 +3719,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
folio_mark_uptodate(folio);
BUG_ON(folio_test_swapcache(folio));
- if (!folio_test_dirty(folio)) {
- filemap_dirty_folio(mapping, folio);
+ if (filemap_dirty_folio(mapping, folio)) {
f2fs_update_dirty_folio(inode, folio);
return true;
}
@@ -3972,6 +3991,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret < 0)
return ret;
+ stat_inc_swapfile_inode(inode);
set_inode_flag(inode, FI_PIN_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
@@ -3981,6 +4001,7 @@ static void f2fs_swap_deactivate(struct file *file)
{
struct inode *inode = file_inode(file);
+ stat_dec_swapfile_inode(inode);
clear_inode_flag(inode, FI_PIN_FILE);
}
#else
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index c01471573977..a216dcdf6941 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -91,7 +91,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
- si->aw_cnt = sbi->atomic_files;
+ si->aw_cnt = atomic_read(&sbi->atomic_files);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ);
si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE);
@@ -135,6 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
si->compr_inode = atomic_read(&sbi->compr_inode);
+ si->swapfile_inode = atomic_read(&sbi->swapfile_inode);
si->compr_blocks = atomic64_read(&sbi->compr_blocks);
si->append = sbi->im[APPEND_INO].ino_num;
si->update = sbi->im[UPDATE_INO].ino_num;
@@ -347,7 +348,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
si->sbi->sb->s_bdev, i++,
- f2fs_readonly(si->sbi->sb) ? "RO": "RW",
+ f2fs_readonly(si->sbi->sb) ? "RO" : "RW",
is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
"Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
if (si->sbi->s_flag) {
@@ -385,6 +386,8 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_dir);
seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n",
si->compr_inode, si->compr_blocks);
+ seq_printf(s, " - Swapfile Inode: %u\n",
+ si->swapfile_inode);
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
@@ -607,6 +610,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inline_dir, 0);
atomic_set(&sbi->compr_inode, 0);
atomic64_set(&sbi->compr_blocks, 0);
+ atomic_set(&sbi->swapfile_inode, 0);
+ atomic_set(&sbi->atomic_files, 0);
atomic_set(&sbi->inplace_count, 0);
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index d5bd7932fb64..21960a899b6a 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -1041,6 +1041,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
__func__, le16_to_cpu(de->name_len));
set_sbi_flag(sbi, SBI_NEED_FSCK);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT);
goto out;
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 866e72b29bd5..932c070173b9 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -544,7 +544,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
if (!et)
return;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+ trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0);
write_lock(&et->lock);
@@ -583,7 +583,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
org_end = dei.fofs + dei.len;
f2fs_bug_on(sbi, pos >= org_end);
- if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+ if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
en->ei.len = pos - en->ei.fofs;
prev_en = en;
parts = 1;
@@ -675,7 +675,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode,
struct rb_node **insert_p = NULL, *insert_parent = NULL;
bool leftmost = false;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen);
+ trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len);
/* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
if (is_inode_flag_set(inode, FI_NO_EXTENT))
@@ -804,9 +804,8 @@ void f2fs_drop_extent_tree(struct inode *inode)
if (!f2fs_may_extent_tree(inode))
return;
- set_inode_flag(inode, FI_NO_EXTENT);
-
write_lock(&et->lock);
+ set_inode_flag(inode, FI_NO_EXTENT);
__free_extent_tree(sbi, et);
if (et->largest.len) {
et->largest.len = 0;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index aea816a133a8..e6355a5683b7 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -266,6 +266,10 @@ enum {
* condition of read on truncated area
* by extent_cache
*/
+ DATA_GENERIC_ENHANCE_UPDATE, /*
+ * strong check on range and segment
+ * bitmap for update case
+ */
META_GENERIC,
};
@@ -274,7 +278,7 @@ enum {
ORPHAN_INO, /* for orphan ino list */
APPEND_INO, /* for append ino list */
UPDATE_INO, /* for update ino list */
- TRANS_DIR_INO, /* for trasactions dir ino list */
+ TRANS_DIR_INO, /* for transactions dir ino list */
FLUSH_INO, /* for multiple device flushing */
MAX_INO_ENTRY, /* max. list */
};
@@ -782,6 +786,7 @@ struct f2fs_inode_info {
unsigned int clevel; /* maximum level of given file name */
struct task_struct *task; /* lookup and create consistency */
struct task_struct *cp_task; /* separate cp/wb IO stats*/
+ struct task_struct *wb_task; /* indicate inode is in context of writeback */
nid_t i_xattr_nid; /* node id that contains xattrs */
loff_t last_disk_size; /* lastly written file size */
spinlock_t i_size_lock; /* protect last_disk_size */
@@ -1158,7 +1163,10 @@ enum iostat_type {
APP_BUFFERED_IO, /* app buffered write IOs */
APP_WRITE_IO, /* app write IOs */
APP_MAPPED_IO, /* app mapped IOs */
+ APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */
+ APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */
FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */
+ FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */
FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */
FS_META_IO, /* meta IOs from kworker/reclaimer */
FS_GC_DATA_IO, /* data IOs from forground gc */
@@ -1172,6 +1180,8 @@ enum iostat_type {
APP_BUFFERED_READ_IO, /* app buffered read IOs */
APP_READ_IO, /* app read IOs */
APP_MAPPED_READ_IO, /* app mapped read IOs */
+ APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */
+ APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */
FS_DATA_READ_IO, /* data read IOs */
FS_GDATA_READ_IO, /* data read IOs from background gc */
FS_CDATA_READ_IO, /* compressed data read IOs */
@@ -1247,7 +1257,6 @@ enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
DIRTY_META, /* for all dirtied inode metadata */
- ATOMIC_FILE, /* for all atomic files */
NR_INODE_TYPE,
};
@@ -1726,11 +1735,9 @@ struct f2fs_sb_info {
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
spinlock_t gc_urgent_high_lock;
- bool gc_urgent_high_limited; /* indicates having limited trial count */
unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */
/* for skip statistic */
- unsigned int atomic_files; /* # of opened atomic file */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
@@ -1761,6 +1768,8 @@ struct f2fs_sb_info {
atomic_t inline_dir; /* # of inline_dentry inodes */
atomic_t compr_inode; /* # of compressed inodes */
atomic64_t compr_blocks; /* # of compressed blocks */
+ atomic_t swapfile_inode; /* # of swapfile inodes */
+ atomic_t atomic_files; /* # of opened atomic file */
atomic_t max_aw_cnt; /* max # of atomic writes */
unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
unsigned int other_skip_bggc; /* skip background gc for other reasons */
@@ -1806,6 +1815,10 @@ struct f2fs_sb_info {
struct workqueue_struct *post_read_wq; /* post read workqueue */
+ unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */
+ spinlock_t error_lock; /* protect errors array */
+ bool error_dirty; /* errors of sb is dirty */
+
struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
@@ -2525,7 +2538,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
if (__cp_payload(sbi) > 0) {
if (flag == NAT_BITMAP)
- return &ckpt->sit_nat_version_bitmap;
+ return tmp_ptr;
else
return (unsigned char *)ckpt + F2FS_BLKSIZE;
} else {
@@ -3547,6 +3560,8 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
int f2fs_quota_sync(struct super_block *sb, int type);
loff_t max_file_blocks(struct inode *inode);
void f2fs_quota_off_umount(struct super_block *sb);
+void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason);
+void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi);
@@ -3706,7 +3721,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi)
/*
* checkpoint.c
*/
-void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io);
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
+ unsigned char reason);
+void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi);
struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
@@ -3736,7 +3753,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi);
int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio);
void f2fs_remove_dirty_inode(struct inode *inode);
-int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
+ bool from_cp);
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
@@ -3858,7 +3876,7 @@ struct f2fs_stat_info {
int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt;
unsigned int cur_ckpt_time, peak_ckpt_time;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
- int compr_inode;
+ int compr_inode, swapfile_inode;
unsigned long long compr_blocks;
int aw_cnt, max_aw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
@@ -3947,6 +3965,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
(atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks))
#define stat_sub_compr_blocks(inode, blocks) \
(atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks))
+#define stat_inc_swapfile_inode(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode))
+#define stat_dec_swapfile_inode(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode))
+#define stat_inc_atomic_inode(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->atomic_files))
+#define stat_dec_atomic_inode(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->atomic_files))
#define stat_inc_meta_count(sbi, blkaddr) \
do { \
if (blkaddr < SIT_I(sbi)->sit_base_addr) \
@@ -3966,7 +3992,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
(atomic_inc(&(sbi)->inplace_count))
#define stat_update_max_atomic_write(inode) \
do { \
- int cur = F2FS_I_SB(inode)->atomic_files; \
+ int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \
int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
@@ -4031,6 +4057,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_dec_compr_inode(inode) do { } while (0)
#define stat_add_compr_blocks(inode, blocks) do { } while (0)
#define stat_sub_compr_blocks(inode, blocks) do { } while (0)
+#define stat_inc_swapfile_inode(inode) do { } while (0)
+#define stat_dec_swapfile_inode(inode) do { } while (0)
+#define stat_inc_atomic_inode(inode) do { } while (0)
+#define stat_dec_atomic_inode(inode) do { } while (0)
#define stat_update_max_atomic_write(inode) do { } while (0)
#define stat_inc_meta_count(sbi, blkaddr) do { } while (0)
#define stat_inc_seg_type(sbi, curseg) do { } while (0)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 791770507328..82cda1258227 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -43,8 +43,8 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
ret = filemap_fault(vmf);
if (!ret)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO,
- F2FS_BLKSIZE);
+ f2fs_update_iostat(F2FS_I_SB(inode), inode,
+ APP_MAPPED_READ_IO, F2FS_BLKSIZE);
trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret);
@@ -154,7 +154,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (!PageUptodate(page))
SetPageUptodate(page);
- f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE);
f2fs_update_time(sbi, REQ_TIME);
trace_f2fs_vm_page_mkwrite(page, DATA);
@@ -822,7 +822,12 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
/* disallow direct IO if any of devices has unaligned blksize */
if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
return true;
-
+ /*
+ * for blkzoned device, fallback direct IO to buffered IO, so
+ * all IOs can be serialized by log-structured write.
+ */
+ if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE))
+ return true;
if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
return true;
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
@@ -912,9 +917,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
- if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!vfsgid_in_group_p(vfsgid) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
set_acl_inode(inode, mode);
}
@@ -1196,6 +1202,7 @@ next_dnode:
!f2fs_is_valid_blkaddr(sbi, *blkaddr,
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
}
@@ -1480,6 +1487,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
break;
}
@@ -2089,9 +2097,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
}
f2fs_i_size_write(fi->cow_inode, i_size_read(inode));
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- sbi->atomic_files++;
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ stat_inc_atomic_inode(inode);
set_inode_flag(inode, FI_ATOMIC_FILE);
set_inode_flag(fi->cow_inode, FI_COW_FILE);
@@ -2185,7 +2191,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
if (ret) {
if (ret == -EROFS) {
ret = 0;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
trace_f2fs_shutdown(sbi, in, ret);
}
@@ -2198,7 +2205,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
ret = freeze_bdev(sb->s_bdev);
if (ret)
goto out;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
thaw_bdev(sb->s_bdev);
break;
@@ -2207,16 +2214,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
ret = f2fs_sync_fs(sb, 1);
if (ret)
goto out;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NOSYNC:
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_METAFLUSH:
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NEED_FSCK:
@@ -3362,8 +3369,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE)))
+ DATA_GENERIC_ENHANCE))) {
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
}
while (count) {
@@ -3524,8 +3533,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE)))
+ DATA_GENERIC_ENHANCE))) {
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
}
while (count) {
@@ -3797,6 +3808,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
f2fs_put_dnode(&dn);
+ f2fs_handle_error(sbi,
+ ERROR_INVALID_BLKADDR);
goto out;
}
@@ -4253,7 +4266,7 @@ static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error,
dec_page_count(sbi, F2FS_DIO_READ);
if (error)
return error;
- f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size);
+ f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size);
return 0;
}
@@ -4342,7 +4355,8 @@ skip_read_trace:
} else {
ret = filemap_read(iocb, to, 0);
if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret);
+ f2fs_update_iostat(F2FS_I_SB(inode), inode,
+ APP_BUFFERED_READ_IO, ret);
}
if (trace_f2fs_dataread_end_enabled())
trace_f2fs_dataread_end(inode, pos, ret);
@@ -4459,7 +4473,8 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
if (ret > 0) {
iocb->ki_pos += ret;
- f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret);
+ f2fs_update_iostat(F2FS_I_SB(inode), inode,
+ APP_BUFFERED_IO, ret);
}
return ret;
}
@@ -4472,7 +4487,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
dec_page_count(sbi, F2FS_DIO_WRITE);
if (error)
return error;
- f2fs_update_iostat(sbi, APP_DIRECT_IO, size);
+ f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size);
return 0;
}
@@ -4660,7 +4675,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
skip_write_trace:
/* Do the actual write. */
ret = dio ?
- f2fs_dio_write_iter(iocb, from, &may_need_sync):
+ f2fs_dio_write_iter(iocb, from, &may_need_sync) :
f2fs_buffered_write_iter(iocb, from);
if (trace_f2fs_datawrite_end_enabled())
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 6da21d405ce1..4546e01b2ee0 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -74,7 +74,8 @@ static int gc_thread_func(void *data)
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_FAULT_INJECT);
}
if (!sb_start_write_trylock(sbi->sb)) {
@@ -97,14 +98,10 @@ static int gc_thread_func(void *data)
*/
if (sbi->gc_mode == GC_URGENT_HIGH) {
spin_lock(&sbi->gc_urgent_high_lock);
- if (sbi->gc_urgent_high_limited) {
- if (!sbi->gc_urgent_high_remaining) {
- sbi->gc_urgent_high_limited = false;
- spin_unlock(&sbi->gc_urgent_high_lock);
- sbi->gc_mode = GC_NORMAL;
- continue;
- }
+ if (sbi->gc_urgent_high_remaining) {
sbi->gc_urgent_high_remaining--;
+ if (!sbi->gc_urgent_high_remaining)
+ sbi->gc_mode = GC_NORMAL;
}
spin_unlock(&sbi->gc_urgent_high_lock);
}
@@ -285,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
/* let's select beginning hot/small space first in no_heap mode*/
if (f2fs_need_rand_seg(sbi))
- p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec);
else if (test_opt(sbi, NOHEAP) &&
(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
@@ -1082,7 +1079,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
{
struct page *node_page;
nid_t nid;
- unsigned int ofs_in_node;
+ unsigned int ofs_in_node, max_addrs;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
@@ -1108,6 +1105,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return false;
}
+ max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE :
+ DEF_ADDRS_PER_BLOCK;
+ if (ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u",
+ ofs_in_node, dni->ino, dni->nid, max_addrs);
+ return false;
+ }
+
*nofs = ofs_of_node(node_page);
source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
@@ -1159,6 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
goto got_it;
@@ -1177,6 +1183,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE))) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
got_it:
@@ -1206,8 +1213,8 @@ got_it:
f2fs_put_page(fio.encrypted_page, 0);
f2fs_put_page(page, 1);
- f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
- f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE);
return 0;
put_encrypted_page:
@@ -1307,8 +1314,10 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto up_out;
}
- f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
- f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO,
+ F2FS_BLKSIZE);
lock_page(mpage);
if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
@@ -1360,7 +1369,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto put_page_out;
}
- f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE);
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
@@ -1706,7 +1715,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
segno, type, GET_SUM_TYPE((&sum->footer)));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_CORRUPTED_SUMMARY);
goto skip;
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index bf46a7dfbea2..21a495234ffd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -64,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode)
void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
{
struct inode *inode = page->mapping->host;
- void *src_addr, *dst_addr;
if (PageUptodate(page))
return;
@@ -74,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE);
/* Copy the whole inline data block */
- src_addr = inline_data_addr(inode, ipage);
- dst_addr = kmap_atomic(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
- flush_dcache_page(page);
- kunmap_atomic(dst_addr);
+ memcpy_to_page(page, 0, inline_data_addr(inode, ipage),
+ MAX_INLINE_DATA(inode));
if (!PageUptodate(page))
SetPageUptodate(page);
}
@@ -164,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
set_sbi_flag(fio.sbi, SBI_NEED_FSCK);
f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
__func__, dn->inode->i_ino, dn->data_blkaddr);
+ f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
}
@@ -246,7 +243,6 @@ out:
int f2fs_write_inline_data(struct inode *inode, struct page *page)
{
- void *src_addr, *dst_addr;
struct dnode_of_data dn;
int err;
@@ -263,10 +259,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_bug_on(F2FS_I_SB(inode), page->index);
f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true);
- src_addr = kmap_atomic(page);
- dst_addr = inline_data_addr(inode, dn.inode_page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
- kunmap_atomic(src_addr);
+ memcpy_from_page(inline_data_addr(inode, dn.inode_page),
+ page, 0, MAX_INLINE_DATA(inode));
set_page_dirty(dn.inode_page);
f2fs_clear_page_cache_dirty_tag(page);
@@ -419,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
__func__, dir->i_ino, dn.data_blkaddr);
+ f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR);
err = -EFSCORRUPTED;
goto out;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 6d11c365d7b4..9f0d3864d9f1 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -81,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi,
if (!__is_valid_data_blkaddr(addr))
return 1;
- if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE))
+ if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) {
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -333,6 +335,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return true;
}
+static void init_idisk_time(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[1] = inode->i_ctime;
+ fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[3] = fi->i_crtime;
+}
+
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -405,6 +417,7 @@ static int do_read_inode(struct inode *inode)
if (!sanity_check_inode(inode, node_page)) {
f2fs_put_page(node_page, 1);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
return -EFSCORRUPTED;
}
@@ -465,10 +478,7 @@ static int do_read_inode(struct inode *inode)
}
}
- fi->i_disk_time[0] = inode->i_atime;
- fi->i_disk_time[1] = inode->i_ctime;
- fi->i_disk_time[2] = inode->i_mtime;
- fi->i_disk_time[3] = fi->i_crtime;
+ init_idisk_time(inode);
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -480,6 +490,12 @@ static int do_read_inode(struct inode *inode)
return 0;
}
+static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino)
+{
+ return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) ||
+ ino == F2FS_COMPRESS_INO(sbi);
+}
+
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -491,16 +507,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
+ if (is_meta_ino(sbi, ino)) {
+ f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ ret = -EFSCORRUPTED;
+ trace_f2fs_iget_exit(inode, ret);
+ iput(inode);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
+ return ERR_PTR(ret);
+ }
+
trace_f2fs_iget(inode);
return inode;
}
- if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
- goto make_now;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (ino == F2FS_COMPRESS_INO(sbi))
+ if (is_meta_ino(sbi, ino))
goto make_now;
-#endif
ret = do_read_inode(inode);
if (ret)
@@ -676,11 +698,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
if (inode->i_nlink == 0)
clear_page_private_inline(node_page);
- F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
- F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
- F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
- F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
-
+ init_idisk_time(inode);
#ifdef CONFIG_F2FS_CHECK_FS
f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page);
#endif
@@ -699,7 +717,8 @@ retry:
cond_resched();
goto retry;
} else if (err != -ENOENT) {
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_UPDATE_INODE);
}
return;
}
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index d84c5f6cc09d..3166a8939ed4 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -31,55 +31,65 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset)
/* print app write IOs */
seq_puts(seq, "[WRITE]\n");
- seq_printf(seq, "app buffered: %-16llu\n",
+ seq_printf(seq, "app buffered data: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_IO]);
- seq_printf(seq, "app direct: %-16llu\n",
+ seq_printf(seq, "app direct data: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_IO]);
- seq_printf(seq, "app mapped: %-16llu\n",
+ seq_printf(seq, "app mapped data: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_IO]);
+ seq_printf(seq, "app buffered cdata: %-16llu\n",
+ sbi->rw_iostat[APP_BUFFERED_CDATA_IO]);
+ seq_printf(seq, "app mapped cdata: %-16llu\n",
+ sbi->rw_iostat[APP_MAPPED_CDATA_IO]);
/* print fs write IOs */
- seq_printf(seq, "fs data: %-16llu\n",
+ seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
+ seq_printf(seq, "fs cdata: %-16llu\n",
+ sbi->rw_iostat[FS_CDATA_IO]);
+ seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
+ seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
+ seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GC_DATA_IO]);
- seq_printf(seq, "fs gc node: %-16llu\n",
+ seq_printf(seq, "fs gc node: %-16llu\n",
sbi->rw_iostat[FS_GC_NODE_IO]);
- seq_printf(seq, "fs cp data: %-16llu\n",
+ seq_printf(seq, "fs cp data: %-16llu\n",
sbi->rw_iostat[FS_CP_DATA_IO]);
- seq_printf(seq, "fs cp node: %-16llu\n",
+ seq_printf(seq, "fs cp node: %-16llu\n",
sbi->rw_iostat[FS_CP_NODE_IO]);
- seq_printf(seq, "fs cp meta: %-16llu\n",
+ seq_printf(seq, "fs cp meta: %-16llu\n",
sbi->rw_iostat[FS_CP_META_IO]);
/* print app read IOs */
seq_puts(seq, "[READ]\n");
- seq_printf(seq, "app buffered: %-16llu\n",
+ seq_printf(seq, "app buffered data: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
- seq_printf(seq, "app direct: %-16llu\n",
+ seq_printf(seq, "app direct data: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_READ_IO]);
- seq_printf(seq, "app mapped: %-16llu\n",
+ seq_printf(seq, "app mapped data: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_READ_IO]);
+ seq_printf(seq, "app buffered cdata: %-16llu\n",
+ sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]);
+ seq_printf(seq, "app mapped cdata: %-16llu\n",
+ sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]);
/* print fs read IOs */
- seq_printf(seq, "fs data: %-16llu\n",
+ seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_READ_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
+ seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GDATA_READ_IO]);
- seq_printf(seq, "fs compr_data: %-16llu\n",
+ seq_printf(seq, "fs cdata: %-16llu\n",
sbi->rw_iostat[FS_CDATA_READ_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
+ seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_READ_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
+ seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_READ_IO]);
/* print other IOs */
seq_puts(seq, "[OTHER]\n");
- seq_printf(seq, "fs discard: %-16llu\n",
+ seq_printf(seq, "fs discard: %-16llu\n",
sbi->rw_iostat[FS_DISCARD]);
return 0;
@@ -159,7 +169,7 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
spin_unlock_irq(&sbi->iostat_lat_lock);
}
-void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes)
{
unsigned long flags;
@@ -176,6 +186,28 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,
if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_READ_IO] += io_bytes;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (inode && f2fs_compressed_file(inode)) {
+ if (type == APP_BUFFERED_IO)
+ sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes;
+
+ if (type == APP_BUFFERED_READ_IO)
+ sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes;
+
+ if (type == APP_MAPPED_READ_IO)
+ sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes;
+
+ if (type == APP_MAPPED_IO)
+ sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes;
+
+ if (type == FS_DATA_READ_IO)
+ sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes;
+
+ if (type == FS_DATA_IO)
+ sbi->rw_iostat[FS_CDATA_IO] += io_bytes;
+ }
+#endif
+
spin_unlock_irqrestore(&sbi->iostat_lock, flags);
f2fs_record_iostat(sbi);
diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h
index 22a2d01f57ef..2c048307b6e0 100644
--- a/fs/f2fs/iostat.h
+++ b/fs/f2fs/iostat.h
@@ -31,7 +31,7 @@ struct iostat_lat_info {
extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
void *offset);
extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi);
-extern void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes);
struct bio_iostat_ctx {
@@ -65,7 +65,7 @@ extern void f2fs_destroy_iostat_processing(void);
extern int f2fs_init_iostat(struct f2fs_sb_info *sbi);
extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi);
#else
-static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes) {}
static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {}
static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index bf00d5057abb..a389772fd212 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -50,7 +50,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
F2FS_I(inode)->i_crtime = inode->i_mtime;
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
if (S_ISDIR(inode->i_mode))
F2FS_I(inode)->i_current_depth = 1;
@@ -845,7 +845,7 @@ out:
}
static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool is_whiteout,
+ struct file *file, umode_t mode, bool is_whiteout,
struct inode **new_inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
@@ -892,8 +892,8 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
} else {
- if (dentry)
- d_tmpfile(dentry, inode);
+ if (file)
+ d_tmpfile(file, inode);
else
f2fs_i_links_write(inode, false);
}
@@ -915,16 +915,19 @@ out:
}
static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ int err;
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL);
+ err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL);
+
+ return finish_open_simple(file, err);
}
static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index e06a0c478b39..983572f23896 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
__func__, nid);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
return -EFSCORRUPTED;
}
return 0;
@@ -585,7 +586,7 @@ retry:
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ up_read(&curseg->journal_rwsem);
if (i >= 0) {
f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
@@ -1295,6 +1296,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
err = -EFSCORRUPTED;
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
}
#endif
@@ -1369,7 +1371,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
err = f2fs_submit_page_bio(&fio);
if (!err)
- f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE);
return err;
}
@@ -2147,8 +2149,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
if (IS_INODE(&folio->page))
f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
#endif
- if (!folio_test_dirty(folio)) {
- filemap_dirty_folio(mapping, folio);
+ if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
set_page_private_reference(&folio->page);
return true;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index dcd0a1e35095..dea95b48b647 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -474,7 +474,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
- unsigned int offset;
+ unsigned int offset, ofs_in_node, max_addrs;
block_t bidx;
int i;
@@ -501,15 +501,25 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
+ ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+
+ max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode);
+ if (ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
+ ofs_in_node, dn->inode->i_ino, nid, max_addrs);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY);
+ return -EFSCORRUPTED;
+ }
+
if (dn->inode->i_ino == nid) {
tdn.nid = nid;
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
} else if (dn->nid == nid) {
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
}
@@ -628,6 +638,7 @@ retry_dn:
inode->i_ino, ofs_of_node(dn.node_page),
ofs_of_node(page));
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
goto err;
}
@@ -640,12 +651,14 @@ retry_dn:
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
if (__is_valid_data_blkaddr(dest) &&
!f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
@@ -698,6 +711,16 @@ retry_prev:
goto err;
}
+ if (f2fs_is_valid_blkaddr(sbi, dest,
+ DATA_GENERIC_ENHANCE_UPDATE)) {
+ f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
+ dest, inode->i_ino, dn.ofs_in_node);
+ err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_INVALID_BLKADDR);
+ goto err;
+ }
+
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
ni.version, false, false);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0de21f82d7bc..acf3d3fa4363 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -187,7 +187,6 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
void f2fs_abort_atomic_write(struct inode *inode, bool clean)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
if (!f2fs_is_atomic_file(inode))
@@ -200,10 +199,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
fi->cow_inode = NULL;
release_atomic_write_cnt(inode);
clear_inode_flag(inode, FI_ATOMIC_FILE);
-
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- sbi->atomic_files--;
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ stat_dec_atomic_inode(inode);
}
static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
@@ -312,6 +308,8 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
ret = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_INVALID_BLKADDR);
goto out;
}
@@ -376,7 +374,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT);
}
/* balance_fs_bg is able to be pending */
@@ -476,12 +474,12 @@ do_sync:
mutex_lock(&sbi->flush_lock);
blk_start_plug(&plug);
- f2fs_sync_dirty_inodes(sbi, FILE_INODE);
+ f2fs_sync_dirty_inodes(sbi, FILE_INODE, false);
blk_finish_plug(&plug);
mutex_unlock(&sbi->flush_lock);
}
- f2fs_sync_fs(sbi->sb, true);
+ f2fs_sync_fs(sbi->sb, 1);
stat_inc_bg_cp_count(sbi->stat_info);
}
@@ -694,7 +692,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
} while (ret && --count);
if (ret) {
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_FLUSH_FAIL);
break;
}
@@ -1171,7 +1170,7 @@ submit:
atomic_inc(&dcc->issued_discard);
- f2fs_update_iostat(sbi, FS_DISCARD, 1);
+ f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1);
lstart += len;
start += len;
@@ -2535,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
sanity_check_seg_type(sbi, seg_type);
if (f2fs_need_rand_seg(sbi))
- return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec);
/* if segs_per_sec is large than 1, we need to keep original policy. */
if (__is_large_section(sbi))
@@ -2589,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
curseg->alloc_type = LFS;
if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
curseg->fragment_remained_chunk =
- prandom_u32() % sbi->max_fragment_chunk + 1;
+ prandom_u32_max(sbi->max_fragment_chunk) + 1;
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2626,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
/* To allocate block chunks in different sizes, use random number */
if (--seg->fragment_remained_chunk <= 0) {
seg->fragment_remained_chunk =
- prandom_u32() % sbi->max_fragment_chunk + 1;
+ prandom_u32_max(sbi->max_fragment_chunk) + 1;
seg->next_blkoff +=
- prandom_u32() % sbi->max_fragment_hole + 1;
+ prandom_u32_max(sbi->max_fragment_hole) + 1;
}
}
}
@@ -3388,7 +3387,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
f2fs_submit_page_write(&fio);
stat_inc_meta_count(sbi, page->index);
- f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE);
}
void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
@@ -3398,7 +3397,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
set_summary(&sum, nid, 0, 0);
do_write_page(&sum, fio);
- f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE);
}
void f2fs_outplace_write_data(struct dnode_of_data *dn,
@@ -3412,7 +3411,7 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn,
do_write_page(&sum, fio);
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
- f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE);
}
int f2fs_inplace_write_data(struct f2fs_io_info *fio)
@@ -3432,6 +3431,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.",
__func__, segno);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE);
goto drop_bio;
}
@@ -3453,7 +3453,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
if (!err) {
f2fs_update_device_state(fio->sbi, fio->ino,
fio->new_blkaddr, 1);
- f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio->sbi, fio->page->mapping->host,
+ fio->io_type, F2FS_BLKSIZE);
}
return err;
@@ -4379,6 +4380,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (se->type >= NR_PERSISTENT_LOG) {
f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
se->type, start);
+ f2fs_handle_error(sbi,
+ ERROR_INCONSISTENT_SUM_TYPE);
return -EFSCORRUPTED;
}
@@ -4415,6 +4418,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
f2fs_err(sbi, "Wrong journal entry on segno %u",
start);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL);
break;
}
@@ -4434,6 +4438,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
se->type, start);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE);
break;
}
@@ -4465,6 +4470,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (sit_valid_blocks[NODE] != valid_node_count(sbi)) {
f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
sit_valid_blocks[NODE], valid_node_count(sbi));
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT);
return -EFSCORRUPTED;
}
@@ -4473,6 +4479,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u",
sit_valid_blocks[DATA], sit_valid_blocks[NODE],
valid_user_blocks(sbi));
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT);
return -EFSCORRUPTED;
}
@@ -4623,6 +4630,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
f2fs_err(sbi,
"Current segment has invalid alloc_type:%d",
curseg->alloc_type);
+ f2fs_handle_error(sbi, ERROR_INVALID_CURSEG);
return -EFSCORRUPTED;
}
@@ -4640,6 +4648,7 @@ out:
"Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u",
i, curseg->segno, curseg->alloc_type,
curseg->next_blkoff, blkofs);
+ f2fs_handle_error(sbi, ERROR_INVALID_CURSEG);
return -EFSCORRUPTED;
}
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index d1d63766f2c7..be8f2d7d007b 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -753,6 +753,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Mismatch valid blocks %d vs. %d",
GET_SIT_VBLOCKS(raw_sit), valid_blocks);
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT);
return -EFSCORRUPTED;
}
@@ -767,6 +768,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Wrong valid blocks %d or segno %u",
GET_SIT_VBLOCKS(raw_sit), segno);
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT);
return -EFSCORRUPTED;
}
return 0;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 26817b5aeac7..3834ead04620 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -301,10 +301,10 @@ static void f2fs_destroy_casefold_cache(void) { }
static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
{
- block_t limit = min((sbi->user_block_count << 1) / 1000,
+ block_t limit = min((sbi->user_block_count >> 3),
sbi->user_block_count - sbi->reserved_blocks);
- /* limit is 0.2% */
+ /* limit is 12.5% */
if (test_opt(sbi, RESERVE_ROOT) &&
F2FS_OPTION(sbi).root_reserved_blocks > limit) {
F2FS_OPTION(sbi).root_reserved_blocks = limit;
@@ -1342,6 +1342,11 @@ default_check:
return -EINVAL;
}
+ if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) {
+ f2fs_err(sbi, "LFS not compatible with ATGC");
+ return -EINVAL;
+ }
+
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1666,9 +1671,8 @@ static int f2fs_freeze(struct super_block *sb)
if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
return -EINVAL;
- /* ensure no checkpoint required */
- if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
- return -EINVAL;
+ /* Let's flush checkpoints and stop the thread. */
+ f2fs_flush_ckpt_thread(F2FS_SB(sb));
/* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
@@ -2181,6 +2185,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_up_write(&sbi->gc_lock);
f2fs_sync_fs(sbi->sb, 1);
+
+ /* Let's ensure there's no pending checkpoint anymore */
+ f2fs_flush_ckpt_thread(sbi);
}
static int f2fs_remount(struct super_block *sb, int *flags, char *data)
@@ -2346,6 +2353,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
f2fs_stop_ckpt_thread(sbi);
need_restart_ckpt = true;
} else {
+ /* Flush if the prevous checkpoint, if exists. */
+ f2fs_flush_ckpt_thread(sbi);
+
err = f2fs_start_ckpt_thread(sbi);
if (err) {
f2fs_err(sbi,
@@ -2465,7 +2475,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
size_t toread;
loff_t i_size = i_size_read(inode);
struct page *page;
- char *kaddr;
if (off > i_size)
return 0;
@@ -2498,9 +2507,7 @@ repeat:
return -EIO;
}
- kaddr = kmap_atomic(page);
- memcpy(data, kaddr + offset, tocopy);
- kunmap_atomic(kaddr);
+ memcpy_from_page(data, page, offset, tocopy);
f2fs_put_page(page, 1);
offset = 0;
@@ -2522,7 +2529,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
size_t towrite = len;
struct page *page;
void *fsdata = NULL;
- char *kaddr;
int err = 0;
int tocopy;
@@ -2541,10 +2547,7 @@ retry:
break;
}
- kaddr = kmap_atomic(page);
- memcpy(kaddr + offset, data, tocopy);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
+ memcpy_to_page(page, offset, data, tocopy);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
page, fsdata);
@@ -3843,6 +3846,68 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return err;
}
+void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ int err;
+
+ f2fs_down_write(&sbi->sb_lock);
+
+ if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1))
+ raw_super->s_stop_reason[reason]++;
+
+ err = f2fs_commit_super(sbi, false);
+ if (err)
+ f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d",
+ reason, err);
+ f2fs_up_write(&sbi->sb_lock);
+}
+
+static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag)
+{
+ spin_lock(&sbi->error_lock);
+ if (!test_bit(flag, (unsigned long *)sbi->errors)) {
+ set_bit(flag, (unsigned long *)sbi->errors);
+ sbi->error_dirty = true;
+ }
+ spin_unlock(&sbi->error_lock);
+}
+
+static bool f2fs_update_errors(struct f2fs_sb_info *sbi)
+{
+ bool need_update = false;
+
+ spin_lock(&sbi->error_lock);
+ if (sbi->error_dirty) {
+ memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors,
+ MAX_F2FS_ERRORS);
+ sbi->error_dirty = false;
+ need_update = true;
+ }
+ spin_unlock(&sbi->error_lock);
+
+ return need_update;
+}
+
+void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error)
+{
+ int err;
+
+ f2fs_save_errors(sbi, error);
+
+ f2fs_down_write(&sbi->sb_lock);
+
+ if (!f2fs_update_errors(sbi))
+ goto out_unlock;
+
+ err = f2fs_commit_super(sbi, false);
+ if (err)
+ f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d",
+ error, err);
+out_unlock:
+ f2fs_up_write(&sbi->sb_lock);
+}
+
static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -4190,6 +4255,9 @@ try_onemore:
goto free_devices;
}
+ spin_lock_init(&sbi->error_lock);
+ memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS);
+
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
percpu_counter_set(&sbi->total_valid_inode_count,
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index eba5fb1629d7..df27afd71ef4 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -128,6 +128,12 @@ static ssize_t sb_status_show(struct f2fs_attr *a,
return sprintf(buf, "%lx\n", sbi->s_flag);
}
+static ssize_t cp_status_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags));
+}
+
static ssize_t pending_discard_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -527,7 +533,6 @@ out:
if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) {
spin_lock(&sbi->gc_urgent_high_lock);
- sbi->gc_urgent_high_limited = t != 0;
sbi->gc_urgent_high_remaining = t;
spin_unlock(&sbi->gc_urgent_high_lock);
@@ -1030,8 +1035,10 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTRIBUTE_GROUPS(f2fs_feat);
F2FS_GENERAL_RO_ATTR(sb_status);
+F2FS_GENERAL_RO_ATTR(cp_status);
static struct attribute *f2fs_stat_attrs[] = {
ATTR_LIST(sb_status),
+ ATTR_LIST(cp_status),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_stat);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 7b8f2b41c29b..c352fff88a5e 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -47,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *addr;
page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
NULL);
if (IS_ERR(page))
return PTR_ERR(page);
- addr = kmap_atomic(page);
- memcpy(buf, addr + offset_in_page(pos), n);
- kunmap_atomic(addr);
+ memcpy_from_page(buf, page, offset_in_page(pos), n);
put_page(page);
@@ -85,16 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
void *fsdata;
- void *addr;
int res;
res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
if (res)
return res;
- addr = kmap_atomic(page);
- memcpy(addr + offset_in_page(pos), buf, n);
- kunmap_atomic(addr);
+ memcpy_to_page(page, offset_in_page(pos), buf, n);
res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
if (res < 0)
@@ -246,6 +240,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes ||
pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) {
f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr");
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_VERITY_XATTR);
return -EFSCORRUPTED;
}
if (buf_size) {
@@ -262,13 +258,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index c76c15086e5f..dc2e8637189e 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -367,6 +367,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
err = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto out;
}
check:
@@ -583,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto cleanup;
}
@@ -658,6 +662,8 @@ static int __f2fs_setxattr(struct inode *inode, int index,
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto exit;
}
@@ -684,6 +690,8 @@ static int __f2fs_setxattr(struct inode *inode, int index,
inode->i_ino, ENTRY_SIZE(last));
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto exit;
}
last = XATTR_NEXT_ENTRY(last);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 249825017da7..00235b8a1823 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -705,7 +705,7 @@ static int fat_readdir(struct file *file, struct dir_context *ctx)
}
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
-static int func(struct dir_context *ctx, const char *name, int name_len, \
+static bool func(struct dir_context *ctx, const char *name, int name_len, \
loff_t offset, u64 ino, unsigned int d_type) \
{ \
struct fat_ioctl_filldir_callback *buf = \
@@ -714,7 +714,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
struct dirent_type __user *d2 = d1 + 1; \
\
if (buf->result) \
- return -EINVAL; \
+ return false; \
buf->result++; \
\
if (name != NULL) { \
@@ -750,10 +750,10 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
put_user(short_len, &d1->d_reclen)) \
goto efault; \
} \
- return 0; \
+ return true; \
efault: \
buf->result = -EFAULT; \
- return -EFAULT; \
+ return false; \
}
FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a38238d75c08..1cbcc4608dc7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -523,7 +523,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
inode->i_generation &= ~1;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6630c69c23a2..f2bc27d1975e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -14,7 +14,7 @@
#include "internal.h"
#include "mount.h"
-static long do_sys_name_to_handle(struct path *path,
+static long do_sys_name_to_handle(const struct path *path,
struct file_handle __user *ufh,
int __user *mnt_id)
{
diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..5f9c802a5d8d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -980,6 +980,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
*ret_fd = fd;
return file;
}
+EXPORT_SYMBOL(task_lookup_next_fd_rcu);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
diff --git a/fs/file_table.c b/fs/file_table.c
index 99c6796c9f28..dd88701e54a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -324,12 +324,7 @@ static void __fput(struct file *file)
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_dec(inode);
- if (mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(mnt);
- }
+ put_file_access(file);
dput(dentry);
if (unlikely(mode & FMODE_NEED_UNMOUNT))
dissolve_on_fput(mnt);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 08a1993ab7fd..443f83382b9b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1718,9 +1718,14 @@ static int writeback_single_inode(struct inode *inode,
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
- else if (!(inode->i_state & I_SYNC_QUEUED) &&
- (inode->i_state & I_DIRTY))
- redirty_tail_locked(inode, wb);
+ else if (!(inode->i_state & I_SYNC_QUEUED)) {
+ if ((inode->i_state & I_DIRTY))
+ redirty_tail_locked(inode, wb);
+ else if (inode->i_state & I_DIRTY_TIME) {
+ inode->dirtied_when = jiffies;
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
+ }
+ }
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
@@ -2370,6 +2375,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (flags & I_DIRTY_INODE) {
/*
+ * Inode timestamp update will piggback on this dirtying.
+ * We tell ->dirty_inode callback that timestamps need to
+ * be updated by setting I_DIRTY_TIME in flags.
+ */
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & I_DIRTY_TIME) {
+ inode->i_state &= ~I_DIRTY_TIME;
+ flags |= I_DIRTY_TIME;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
+ /*
* Notify the filesystem about the inode being dirtied, so that
* (if needed) it can update on-disk fields and journal the
* inode. This is only needed when the inode itself is being
@@ -2378,7 +2397,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
+ sb->s_op->dirty_inode(inode,
+ flags & (I_DIRTY_INODE | I_DIRTY_TIME));
trace_writeback_dirty_inode(inode, flags);
/* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
@@ -2399,21 +2419,15 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
smp_mb();
- if (((inode->i_state & flags) == flags) ||
- (dirtytime && (inode->i_state & I_DIRTY_INODE)))
+ if ((inode->i_state & flags) == flags)
return;
spin_lock(&inode->i_lock);
- if (dirtytime && (inode->i_state & I_DIRTY_INODE))
- goto out_unlock_inode;
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
inode_attach_wb(inode, NULL);
- /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
- if (flags & I_DIRTY_INODE)
- inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
/*
@@ -2486,7 +2500,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
out_unlock:
if (wb)
spin_unlock(&wb->list_lock);
-out_unlock_inode:
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51897427a534..b4a6e0a1b945 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -776,7 +776,8 @@ static int fuse_check_page(struct page *page)
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b585b04e815e..bb97a384dc5d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -529,7 +529,7 @@ out_err:
*/
static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct file *file, unsigned int flags,
- umode_t mode)
+ umode_t mode, u32 opcode)
{
int err;
struct inode *inode;
@@ -573,7 +573,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
- args.opcode = FUSE_CREATE;
+ args.opcode = opcode;
args.nodeid = get_node_id(dir);
args.in_numargs = 2;
args.in_args[0].size = sizeof(inarg);
@@ -676,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
if (fc->no_create)
goto mknod;
- err = fuse_create_open(dir, entry, file, flags, mode);
+ err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE);
if (err == -ENOSYS) {
fc->no_create = 1;
goto mknod;
@@ -802,6 +802,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir,
return fuse_mknod(&init_user_ns, dir, entry, mode, 0);
}
+static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ int err;
+
+ if (fc->no_tmpfile)
+ return -EOPNOTSUPP;
+
+ err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE);
+ if (err == -ENOSYS) {
+ fc->no_tmpfile = 1;
+ err = -EOPNOTSUPP;
+ }
+ return err;
+}
+
static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *entry, umode_t mode)
{
@@ -1913,6 +1930,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
.setattr = fuse_setattr,
.create = fuse_create,
.atomic_open = fuse_atomic_open,
+ .tmpfile = fuse_tmpfile,
.mknod = fuse_mknod,
.permission = fuse_permission,
.getattr = fuse_getattr,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 488b460e046f..98a9cf531873 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -784,6 +784,9 @@ struct fuse_conn {
/* Does the filesystem support per inode DAX? */
unsigned int inode_dax:1;
+ /* Is tmpfile not implemented by fs? */
+ unsigned int no_tmpfile:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 756d05779200..cf40895233f5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -66,7 +66,7 @@ struct get_name_filldir {
char *name;
};
-static int get_name_filldir(struct dir_context *ctx, const char *name,
+static bool get_name_filldir(struct dir_context *ctx, const char *name,
int length, loff_t offset, u64 inum,
unsigned int type)
{
@@ -74,12 +74,12 @@ static int get_name_filldir(struct dir_context *ctx, const char *name,
container_of(ctx, struct get_name_filldir, ctx);
if (inum != gnfd->inum.no_addr)
- return 0;
+ return true;
memcpy(gnfd->name, name, length);
gnfd->name[length] = 0;
- return 1;
+ return false;
}
static int gfs2_get_name(struct dentry *parent, char *name,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 892006fbbb09..60c6fb91fb58 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1443,6 +1443,22 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
}
+static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh)
+{
+ struct gfs2_glock *gl = fl_gh->gh_gl;
+
+ /*
+ * Make sure gfs2_glock_put() won't sleep under the file->f_lock
+ * spinlock.
+ */
+
+ gfs2_glock_hold(gl);
+ spin_lock(&file->f_lock);
+ gfs2_holder_uninit(fl_gh);
+ spin_unlock(&file->f_lock);
+ gfs2_glock_put(gl);
+}
+
static int do_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct gfs2_file *fp = file->private_data;
@@ -1455,7 +1471,9 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int sleeptime;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
+ flags = GL_EXACT | GL_NOPID;
+ if (!IS_SETLKW(cmd))
+ flags |= LM_FLAG_TRY_1CB;
mutex_lock(&fp->f_fl_mutex);
@@ -1474,18 +1492,21 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
&gfs2_flock_glops, CREATE, &gl);
if (error)
goto out;
+ spin_lock(&file->f_lock);
gfs2_holder_init(gl, state, flags, fl_gh);
+ spin_unlock(&file->f_lock);
gfs2_glock_put(gl);
}
for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
error = gfs2_glock_nq(fl_gh);
if (error != GLR_TRYFAILED)
break;
- fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
+ fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB;
+ fl_gh->gh_flags |= LM_FLAG_TRY;
msleep(sleeptime);
}
if (error) {
- gfs2_holder_uninit(fl_gh);
+ __flock_holder_uninit(file, fl_gh);
if (error == GLR_TRYFAILED)
error = -EAGAIN;
} else {
@@ -1507,7 +1528,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
locks_lock_file_wait(file, fl);
if (gfs2_holder_initialized(fl_gh)) {
gfs2_glock_dq(fl_gh);
- gfs2_holder_uninit(fl_gh);
+ __flock_holder_uninit(file, fl_gh);
}
mutex_unlock(&fp->f_fl_mutex);
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 41b6c89e4bf7..df335c258eb0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -33,6 +33,9 @@
#include <linux/list_sort.h>
#include <linux/lockref.h>
#include <linux/rhashtable.h>
+#include <linux/pid_namespace.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
#include "gfs2.h"
#include "incore.h"
@@ -59,6 +62,8 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl);
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static void __gfs2_glock_dq(struct gfs2_holder *gh);
+static void handle_callback(struct gfs2_glock *gl, unsigned int state,
+ unsigned long delay, bool remote);
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
@@ -730,7 +735,8 @@ static bool is_system_glock(struct gfs2_glock *gl)
*
*/
-static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh,
+ unsigned int target)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
@@ -741,7 +747,8 @@ __acquires(&gl->gl_lockref.lock)
if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) &&
gh && !(gh->gh_flags & LM_FLAG_NOEXP))
- return;
+ goto skip_inval;
+
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
GLOCK_BUG_ON(gl, gl->gl_state == target);
@@ -826,6 +833,20 @@ skip_inval:
(target != LM_ST_UNLOCKED ||
test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) {
if (!is_system_glock(gl)) {
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */
+ /*
+ * Ordinarily, we would call dlm and its callback would call
+ * finish_xmote, which would call state_change() to the new state.
+ * Since we withdrew, we won't call dlm, so call state_change
+ * manually, but to the UNLOCKED state we desire.
+ */
+ state_change(gl, LM_ST_UNLOCKED);
+ /*
+ * We skip telling dlm to do the locking, so we won't get a
+ * reply that would otherwise clear GLF_LOCK. So we clear it here.
+ */
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
goto out;
} else {
@@ -1018,16 +1039,18 @@ static void delete_work_func(struct work_struct *work)
if (gfs2_queue_delete_work(gl, 5 * HZ))
return;
}
- goto out;
}
inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
GFS2_BLKST_UNLINKED);
- if (!IS_ERR_OR_NULL(inode)) {
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -EAGAIN &&
+ (gfs2_queue_delete_work(gl, 5 * HZ)))
+ return;
+ } else {
d_prune_aliases(inode);
iput(inode);
}
-out:
gfs2_glock_put(gl);
}
@@ -1436,6 +1459,15 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
va_end(args);
}
+static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
+{
+ if (!(gh->gh_flags & GL_NOPID))
+ return true;
+ if (gh->gh_state == LM_ST_UNLOCKED)
+ return true;
+ return false;
+}
+
/**
* add_to_queue - Add a holder to the wait queue (but look for recursion)
* @gh: the holder structure to add
@@ -1472,10 +1504,17 @@ __acquires(&gl->gl_lockref.lock)
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
- if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
- !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
- goto trap_recursive;
+ if (likely(gh2->gh_owner_pid != gh->gh_owner_pid))
+ continue;
+ if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK)
+ continue;
+ if (test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))
+ continue;
+ if (!pid_is_meaningful(gh2))
+ continue;
+ goto trap_recursive;
+ }
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
if (try_futile &&
!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
fail:
@@ -2194,6 +2233,20 @@ static void dump_glock_func(struct gfs2_glock *gl)
dump_glock(NULL, gl, true);
}
+static void withdraw_dq(struct gfs2_glock *gl)
+{
+ spin_lock(&gl->gl_lockref.lock);
+ if (!__lockref_is_dead(&gl->gl_lockref) &&
+ glock_blocked_by_withdraw(gl))
+ do_error(gl, LM_OUT_ERROR); /* remove pending waiters */
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
+void gfs2_gl_dq_holders(struct gfs2_sbd *sdp)
+{
+ glock_hash_walk(withdraw_dq, sdp);
+}
+
/**
* gfs2_gl_hash_clear - Empty out the glock hash table
* @sdp: the filesystem
@@ -2272,19 +2325,24 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh,
const char *fs_id_buf)
{
- struct task_struct *gh_owner = NULL;
+ const char *comm = "(none)";
+ pid_t owner_pid = 0;
char flags_buf[32];
rcu_read_lock();
- if (gh->gh_owner_pid)
+ if (pid_is_meaningful(gh)) {
+ struct task_struct *gh_owner;
+
+ comm = "(ended)";
+ owner_pid = pid_nr(gh->gh_owner_pid);
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
+ if (gh_owner)
+ comm = gh_owner->comm;
+ }
gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
fs_id_buf, state2str(gh->gh_state),
hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
- gh->gh_error,
- gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
- gh_owner ? gh_owner->comm : "(ended)",
- (void *)gh->gh_ip);
+ gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip);
rcu_read_unlock();
}
@@ -2699,6 +2757,172 @@ static const struct file_operations gfs2_glstats_fops = {
.release = gfs2_glocks_release,
};
+struct gfs2_glockfd_iter {
+ struct super_block *sb;
+ unsigned int tgid;
+ struct task_struct *task;
+ unsigned int fd;
+ struct file *file;
+};
+
+static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct pid *pid;
+
+ if (i->task)
+ put_task_struct(i->task);
+
+ rcu_read_lock();
+retry:
+ i->task = NULL;
+ pid = find_ge_pid(i->tgid, ns);
+ if (pid) {
+ i->tgid = pid_nr_ns(pid, ns);
+ i->task = pid_task(pid, PIDTYPE_TGID);
+ if (!i->task) {
+ i->tgid++;
+ goto retry;
+ }
+ get_task_struct(i->task);
+ }
+ rcu_read_unlock();
+ return i->task;
+}
+
+static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
+{
+ if (i->file) {
+ fput(i->file);
+ i->file = NULL;
+ }
+
+ rcu_read_lock();
+ for(;; i->fd++) {
+ struct inode *inode;
+
+ i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+ if (!i->file) {
+ i->fd = 0;
+ break;
+ }
+ inode = file_inode(i->file);
+ if (inode->i_sb != i->sb)
+ continue;
+ if (get_file_rcu(i->file))
+ break;
+ }
+ rcu_read_unlock();
+ return i->file;
+}
+
+static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+
+ if (*pos)
+ return NULL;
+ while (gfs2_glockfd_next_task(i)) {
+ if (gfs2_glockfd_next_file(i))
+ return i;
+ i->tgid++;
+ }
+ return NULL;
+}
+
+static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr,
+ loff_t *pos)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+
+ (*pos)++;
+ i->fd++;
+ do {
+ if (gfs2_glockfd_next_file(i))
+ return i;
+ i->tgid++;
+ } while (gfs2_glockfd_next_task(i));
+ return NULL;
+}
+
+static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+
+ if (i->file)
+ fput(i->file);
+ if (i->task)
+ put_task_struct(i->task);
+}
+
+static void gfs2_glockfd_seq_show_flock(struct seq_file *seq,
+ struct gfs2_glockfd_iter *i)
+{
+ struct gfs2_file *fp = i->file->private_data;
+ struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+ struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED };
+
+ if (!READ_ONCE(fl_gh->gh_gl))
+ return;
+
+ spin_lock(&i->file->f_lock);
+ if (gfs2_holder_initialized(fl_gh))
+ gl_name = fl_gh->gh_gl->gl_name;
+ spin_unlock(&i->file->f_lock);
+
+ if (gl_name.ln_type != LM_TYPE_RESERVED) {
+ seq_printf(seq, "%d %u %u/%llx\n",
+ i->tgid, i->fd, gl_name.ln_type,
+ (unsigned long long)gl_name.ln_number);
+ }
+}
+
+static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+ struct inode *inode = file_inode(i->file);
+ struct gfs2_glock *gl;
+
+ inode_lock_shared(inode);
+ gl = GFS2_I(inode)->i_iopen_gh.gh_gl;
+ if (gl) {
+ seq_printf(seq, "%d %u %u/%llx\n",
+ i->tgid, i->fd, gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ }
+ gfs2_glockfd_seq_show_flock(seq, i);
+ inode_unlock_shared(inode);
+ return 0;
+}
+
+static const struct seq_operations gfs2_glockfd_seq_ops = {
+ .start = gfs2_glockfd_seq_start,
+ .next = gfs2_glockfd_seq_next,
+ .stop = gfs2_glockfd_seq_stop,
+ .show = gfs2_glockfd_seq_show,
+};
+
+static int gfs2_glockfd_open(struct inode *inode, struct file *file)
+{
+ struct gfs2_glockfd_iter *i;
+ struct gfs2_sbd *sdp = inode->i_private;
+
+ i = __seq_open_private(file, &gfs2_glockfd_seq_ops,
+ sizeof(struct gfs2_glockfd_iter));
+ if (!i)
+ return -ENOMEM;
+ i->sb = sdp->sd_vfs;
+ return 0;
+}
+
+static const struct file_operations gfs2_glockfd_fops = {
+ .owner = THIS_MODULE,
+ .open = gfs2_glockfd_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats);
void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
@@ -2708,6 +2932,9 @@ void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_glocks_fops);
+ debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glockfd_fops);
+
debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_glstats_fops);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 5aed8b500cf5..0d068f4fd7d6 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -91,6 +91,7 @@ enum {
#define GL_ASYNC 0x0040
#define GL_EXACT 0x0080
#define GL_SKIP 0x0100
+#define GL_NOPID 0x0200
#define GL_NOCACHE 0x0400
/*
@@ -274,6 +275,7 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl);
extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
extern void gfs2_glock_free(struct gfs2_glock *gl);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c8ec876f33ea..04a201584fa7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_glock *io_gl;
+ int extra_flags = 0;
error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE,
&ip->i_gl);
@@ -141,9 +142,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (unlikely(error))
goto fail;
- if (blktype != GFS2_BLKST_UNLINKED)
+ if (blktype == GFS2_BLKST_UNLINKED)
+ extra_flags |= LM_FLAG_TRY;
+ else
gfs2_cancel_delete_work(io_gl);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT,
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED,
+ GL_EXACT | GL_NOPID | extra_flags,
&ip->i_iopen_gh);
gfs2_glock_put(io_gl);
if (unlikely(error))
@@ -210,6 +214,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
return inode;
fail:
+ if (error == GLR_TRYFAILED)
+ error = -EAGAIN;
if (gfs2_holder_initialized(&ip->i_iopen_gh))
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
if (gfs2_holder_initialized(&i_gh))
@@ -720,7 +726,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr);
BUG_ON(error);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID,
+ &ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 14ae9de76277..afcb32854f14 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -151,14 +151,6 @@ static int __init init_gfs2_fs(void)
if (error)
goto fail_shrinker;
- error = register_filesystem(&gfs2_fs_type);
- if (error)
- goto fail_fs1;
-
- error = register_filesystem(&gfs2meta_fs_type);
- if (error)
- goto fail_fs2;
-
error = -ENOMEM;
gfs_recovery_wq = alloc_workqueue("gfs_recovery",
WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
@@ -180,11 +172,23 @@ static int __init init_gfs2_fs(void)
goto fail_mempool;
gfs2_register_debugfs();
+ error = register_filesystem(&gfs2_fs_type);
+ if (error)
+ goto fail_fs1;
+
+ error = register_filesystem(&gfs2meta_fs_type);
+ if (error)
+ goto fail_fs2;
+
pr_info("GFS2 installed\n");
return 0;
+fail_fs2:
+ unregister_filesystem(&gfs2_fs_type);
+fail_fs1:
+ mempool_destroy(gfs2_page_pool);
fail_mempool:
destroy_workqueue(gfs2_freeze_wq);
fail_wq3:
@@ -192,10 +196,6 @@ fail_wq3:
fail_wq2:
destroy_workqueue(gfs_recovery_wq);
fail_wq1:
- unregister_filesystem(&gfs2meta_fs_type);
-fail_fs2:
- unregister_filesystem(&gfs2_fs_type);
-fail_fs1:
unregister_shrinker(&gfs2_qd_shrinker);
fail_shrinker:
kmem_cache_destroy(gfs2_trans_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 7e70e0ba5a6c..6ed728aae9a5 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -525,8 +525,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
- if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh);
+ bh_read_nowait(first_bh, REQ_META | REQ_PRIO);
dblock++;
extlen--;
@@ -534,9 +533,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
while (extlen) {
bh = gfs2_getbuf(gl, dblock, CREATE);
- if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META |
- REQ_PRIO, 1, &bh);
+ bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO);
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 549879929c84..c0cf1d2d0ef5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -178,7 +178,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
pr_warn("Invalid block size\n");
return -EINVAL;
}
-
+ if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) {
+ pr_warn("Invalid block size shift\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -381,8 +384,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
if (!table[0])
table = sdp->sd_vfs->s_id;
- strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN);
- strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN);
+ BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN);
+
+ strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN);
+ strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN);
table = sdp->sd_table_name;
while ((table = strchr(table, '/')))
@@ -401,7 +406,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
error = gfs2_glock_nq_num(sdp,
GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
- LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
+ LM_ST_EXCLUSIVE,
+ LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID,
mount_gh);
if (error) {
fs_err(sdp, "can't acquire mount glock: %d\n", error);
@@ -411,7 +417,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
error = gfs2_glock_nq_num(sdp,
GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
LM_ST_SHARED,
- LM_FLAG_NOEXP | GL_EXACT,
+ LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
&sdp->sd_live_gh);
if (error) {
fs_err(sdp, "can't acquire live glock: %d\n", error);
@@ -687,7 +693,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
iput(pn);
pn = NULL;
ip = GFS2_I(sdp->sd_sc_inode);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID,
&sdp->sd_sc_gh);
if (error) {
fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
@@ -776,7 +782,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
&gfs2_journal_glops,
LM_ST_EXCLUSIVE,
- LM_FLAG_NOEXP | GL_NOCACHE,
+ LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID,
&sdp->sd_journal_gh);
if (error) {
fs_err(sdp, "can't acquire journal glock: %d\n", error);
@@ -786,7 +792,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
ip = GFS2_I(sdp->sd_jdesc->jd_inode);
sdp->sd_jinode_gl = ip->i_gl;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
- LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
+ LM_FLAG_NOEXP | GL_EXACT |
+ GL_NOCACHE | GL_NOPID,
&sdp->sd_jinode_gh);
if (error) {
fs_err(sdp, "can't acquire journal inode glock: %d\n",
@@ -957,7 +964,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
pn = NULL;
ip = GFS2_I(sdp->sd_qc_inode);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID,
&sdp->sd_qc_gh);
if (error) {
fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -1439,13 +1446,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (o) {
case Opt_lockproto:
- strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_locktable:
- strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_hostdata:
- strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_spectator:
args->ar_spectator = 1;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f201eaf59d0d..1ed17226d9ed 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -745,12 +745,8 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
}
if (PageUptodate(page))
set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- goto unlock_out;
- }
+ if (bh_read(bh, REQ_META | REQ_PRIO) < 0)
+ goto unlock_out;
if (gfs2_is_jdata(ip))
gfs2_trans_add_data(ip->i_gl, bh);
else
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b5b0f285b27f..b018957a1bb2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -346,7 +346,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
}
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
- LM_FLAG_NOEXP, &sdp->sd_freeze_gh);
+ LM_FLAG_NOEXP | GL_NOPID,
+ &sdp->sd_freeze_gh);
if (error)
goto out;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 8241029a2a5d..7a6aeffcdf5c 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -164,6 +164,11 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
}
if (!ret)
gfs2_make_fs_ro(sdp);
+ /*
+ * Dequeue any pending non-system glock holders that can no
+ * longer be granted because the file system is withdrawn.
+ */
+ gfs2_gl_dq_holders(sdp);
gfs2_freeze_unlock(&freeze_gh);
}
@@ -204,6 +209,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
* exception code in glock_dq.
*/
iput(inode);
+ sdp->sd_jdesc->jd_inode = NULL;
/*
* Wait until the journal inode's glock is freed. This allows try locks
* on other nodes to be successful, otherwise we remain the owner of
@@ -226,7 +232,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
*/
fs_warn(sdp, "Requesting recovery of jid %d.\n",
sdp->sd_lockstruct.ls_jid);
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP,
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE,
+ LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | GL_NOPID,
&sdp->sd_live_gh);
msleep(GL_GLOCK_MAX_HOLD);
/*
@@ -251,7 +258,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
fs_warn(sdp, "Unable to recover our journal jid %d.\n",
sdp->sd_lockstruct.ls_jid);
gfs2_glock_dq_wait(&sdp->sd_live_gh);
- gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT,
+ gfs2_holder_reinit(LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
&sdp->sd_live_gh);
gfs2_glock_nq(&sdp->sd_live_gh);
}
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index c83fd0e8404d..2015e42e752a 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
int pagenum;
int bytes_read;
int bytes_to_read;
- void *vaddr;
off += node->page_offset;
pagenum = off >> PAGE_SHIFT;
@@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
page = node->page[pagenum];
bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);
- vaddr = kmap_atomic(page);
- memcpy(buf + bytes_read, vaddr + off, bytes_to_read);
- kunmap_atomic(vaddr);
+ memcpy_from_page(buf + bytes_read, page, off, bytes_to_read);
pagenum++;
off = 0; /* page offset only applies to the first page */
@@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
off += node->page_offset;
page = node->page[0];
- memcpy(kmap(page) + off, buf, len);
- kunmap(page);
+ memcpy_to_page(page, off, buf, len);
set_page_dirty(page);
}
@@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
off += node->page_offset;
page = node->page[0];
- memset(kmap(page) + off, 0, len);
- kunmap(page);
+ memzero_page(page, off, len);
set_page_dirty(page);
}
@@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
src_page = src_node->page[0];
dst_page = dst_node->page[0];
- memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len);
- kunmap(src_page);
- kunmap(dst_page);
+ memcpy_page(dst_page, dst, src_page, src, len);
set_page_dirty(dst_page);
}
@@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
src += node->page_offset;
dst += node->page_offset;
page = node->page[0];
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
memmove(ptr + dst, ptr + src, len);
- kunmap(page);
+ kunmap_local(ptr);
set_page_dirty(page);
}
@@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
if (!test_bit(HFS_BNODE_NEW, &node->flags))
return node;
- desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+ desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) +
+ node->page_offset);
node->prev = be32_to_cpu(desc->prev);
node->next = be32_to_cpu(desc->next);
node->num_recs = be16_to_cpu(desc->num_recs);
node->type = desc->type;
node->height = desc->height;
- kunmap(node->page[0]);
+ kunmap_local(desc);
switch (node->type) {
case HFS_NODE_HEADER:
@@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
}
pagep = node->page;
- memset(kmap(*pagep) + node->page_offset, 0,
- min((int)PAGE_SIZE, (int)tree->node_size));
+ memzero_page(*pagep, node->page_offset,
+ min((int)PAGE_SIZE, (int)tree->node_size));
set_page_dirty(*pagep);
- kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_SIZE);
+ memzero_page(*++pagep, 0, PAGE_SIZE);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
clear_bit(HFS_BNODE_NEW, &node->flags);
wake_up(&node->lock_wq);
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 19017d296173..2fa4b1f8cc7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
goto free_inode;
/* Load the header */
- head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
+ sizeof(struct hfs_bnode_desc));
tree->root = be32_to_cpu(head->root);
tree->leaf_count = be32_to_cpu(head->leaf_count);
tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -119,11 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
tree->node_size_shift = ffs(size) - 1;
tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- kunmap(page);
+ kunmap_local(head);
put_page(page);
return tree;
fail_page:
+ kunmap_local(head);
put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfs_aops;
@@ -169,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
return;
/* Load the header */
page = node->page[0];
- head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
+ sizeof(struct hfs_bnode_desc));
head->root = cpu_to_be32(tree->root);
head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -180,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree)
head->attributes = cpu_to_be32(tree->attributes);
head->depth = cpu_to_be16(tree->depth);
- kunmap(page);
+ kunmap_local(head);
set_page_dirty(page);
hfs_bnode_put(node);
}
@@ -268,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
idx = 0;
@@ -281,7 +284,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
idx += i;
data[off] |= m;
set_page_dirty(*pagep);
- kunmap(*pagep);
+ kunmap_local(data);
tree->free_nodes--;
mark_inode_dirty(tree->inode);
hfs_bnode_put(node);
@@ -290,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
if (++off >= PAGE_SIZE) {
- kunmap(*pagep);
- data = kmap(*++pagep);
+ kunmap_local(data);
+ data = kmap_local_page(*++pagep);
off = 0;
}
idx += 8;
len--;
}
- kunmap(*pagep);
+ kunmap_local(data);
nidx = node->next;
if (!nidx) {
printk(KERN_DEBUG "create new bmap node...\n");
@@ -313,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
}
}
@@ -360,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node)
}
off += node->page_offset + nidx / 8;
page = node->page[off >> PAGE_SHIFT];
- data = kmap(page);
+ data = kmap_local_page(page);
off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
pr_crit("trying to free free bnode %u(%d)\n",
node->this, node->type);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
return;
}
data[off] = byte & ~m;
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
tree->free_nodes++;
mark_inode_dirty(tree->inode);
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index cebce0cfe340..bd8dcea85588 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
start = size;
goto out;
}
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
i = offset % 32;
offset &= ~(PAGE_CACHE_BITS - 1);
@@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
}
curr++;
}
- kunmap(page);
+ kunmap_local(pptr);
offset += PAGE_CACHE_BITS;
if (offset >= size)
break;
@@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
start = size;
goto out;
}
- curr = pptr = kmap(page);
+ curr = pptr = kmap_local_page(page);
if ((size ^ offset) / PAGE_CACHE_BITS)
end = pptr + PAGE_CACHE_BITS / 32;
else
@@ -127,7 +127,7 @@ found:
len -= 32;
}
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
offset += PAGE_CACHE_BITS;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
NULL);
@@ -135,7 +135,7 @@ found:
start = size;
goto out;
}
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
}
@@ -151,7 +151,7 @@ last:
done:
*curr = cpu_to_be32(n);
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
*max = offset + (curr - pptr) * 32 + i - start;
sbi->free_blocks -= *max;
hfsplus_mark_mdb_dirty(sb);
@@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
page = read_mapping_page(mapping, pnr, NULL);
if (IS_ERR(page))
goto kaboom;
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
end = pptr + PAGE_CACHE_BITS / 32;
len = count;
@@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
if (!count)
break;
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
page = read_mapping_page(mapping, ++pnr, NULL);
if (IS_ERR(page))
goto kaboom;
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
}
@@ -231,7 +231,7 @@ done:
}
out:
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
sbi->free_blocks += len;
hfsplus_mark_mdb_dirty(sb);
mutex_unlock(&sbi->alloc_mutex);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index a5ab00e54220..87974d5e6791 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
off &= ~PAGE_MASK;
l = min_t(int, len, PAGE_SIZE - off);
- memcpy(buf, kmap(*pagep) + off, l);
- kunmap(*pagep);
+ memcpy_from_page(buf, *pagep, off, l);
while ((len -= l) != 0) {
buf += l;
l = min_t(int, len, PAGE_SIZE);
- memcpy(buf, kmap(*++pagep), l);
- kunmap(*pagep);
+ memcpy_from_page(buf, *++pagep, 0, l);
}
}
@@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
off &= ~PAGE_MASK;
l = min_t(int, len, PAGE_SIZE - off);
- memcpy(kmap(*pagep) + off, buf, l);
+ memcpy_to_page(*pagep, off, buf, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
l = min_t(int, len, PAGE_SIZE);
- memcpy(kmap(*++pagep), buf, l);
+ memcpy_to_page(*++pagep, 0, buf, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
}
@@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
off &= ~PAGE_MASK;
l = min_t(int, len, PAGE_SIZE - off);
- memset(kmap(*pagep) + off, 0, l);
+ memzero_page(*pagep, off, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
while ((len -= l) != 0) {
l = min_t(int, len, PAGE_SIZE);
- memset(kmap(*++pagep), 0, l);
+ memzero_page(*++pagep, 0, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
}
@@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
if (src == dst) {
l = min_t(int, len, PAGE_SIZE - src);
- memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l);
- kunmap(*src_page);
+ memcpy_page(*dst_page, src, *src_page, src, l);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
while ((len -= l) != 0) {
l = min_t(int, len, PAGE_SIZE);
- memcpy(kmap(*++dst_page), kmap(*++src_page), l);
- kunmap(*src_page);
+ memcpy_page(*++dst_page, 0, *++src_page, 0, l);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
}
} else {
void *src_ptr, *dst_ptr;
do {
- src_ptr = kmap(*src_page) + src;
- dst_ptr = kmap(*dst_page) + dst;
+ dst_ptr = kmap_local_page(*dst_page) + dst;
+ src_ptr = kmap_local_page(*src_page) + src;
if (PAGE_SIZE - src < PAGE_SIZE - dst) {
l = PAGE_SIZE - src;
src = 0;
@@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
}
l = min(len, l);
memcpy(dst_ptr, src_ptr, l);
- kunmap(*src_page);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
if (!dst)
dst_page++;
else
@@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
{
struct page **src_page, **dst_page;
+ void *src_ptr, *dst_ptr;
int l;
hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
@@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
if (src == dst) {
while (src < len) {
- memmove(kmap(*dst_page), kmap(*src_page), src);
- kunmap(*src_page);
+ dst_ptr = kmap_local_page(*dst_page);
+ src_ptr = kmap_local_page(*src_page);
+ memmove(dst_ptr, src_ptr, src);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
len -= src;
src = PAGE_SIZE;
src_page--;
dst_page--;
}
src -= len;
- memmove(kmap(*dst_page) + src,
- kmap(*src_page) + src, len);
- kunmap(*src_page);
+ dst_ptr = kmap_local_page(*dst_page);
+ src_ptr = kmap_local_page(*src_page);
+ memmove(dst_ptr + src, src_ptr + src, len);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
} else {
- void *src_ptr, *dst_ptr;
-
do {
- src_ptr = kmap(*src_page) + src;
- dst_ptr = kmap(*dst_page) + dst;
+ dst_ptr = kmap_local_page(*dst_page) + dst;
+ src_ptr = kmap_local_page(*src_page) + src;
if (src < dst) {
l = src;
src = PAGE_SIZE;
@@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
}
l = min(len, l);
memmove(dst_ptr - l, src_ptr - l, l);
- kunmap(*src_page);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
if (dst == PAGE_SIZE)
dst_page--;
else
@@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
if (src == dst) {
l = min_t(int, len, PAGE_SIZE - src);
- memmove(kmap(*dst_page) + src,
- kmap(*src_page) + src, l);
- kunmap(*src_page);
+
+ dst_ptr = kmap_local_page(*dst_page) + src;
+ src_ptr = kmap_local_page(*src_page) + src;
+ memmove(dst_ptr, src_ptr, l);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
while ((len -= l) != 0) {
l = min_t(int, len, PAGE_SIZE);
- memmove(kmap(*++dst_page),
- kmap(*++src_page), l);
- kunmap(*src_page);
+ dst_ptr = kmap_local_page(*++dst_page);
+ src_ptr = kmap_local_page(*++src_page);
+ memmove(dst_ptr, src_ptr, l);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
}
} else {
- void *src_ptr, *dst_ptr;
-
do {
- src_ptr = kmap(*src_page) + src;
- dst_ptr = kmap(*dst_page) + dst;
+ dst_ptr = kmap_local_page(*dst_page) + dst;
+ src_ptr = kmap_local_page(*src_page) + src;
if (PAGE_SIZE - src <
PAGE_SIZE - dst) {
l = PAGE_SIZE - src;
@@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
}
l = min(len, l);
memmove(dst_ptr, src_ptr, l);
- kunmap(*src_page);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
if (!dst)
dst_page++;
else
@@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
if (!test_bit(HFS_BNODE_NEW, &node->flags))
return node;
- desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
- node->page_offset);
+ desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) +
+ node->page_offset);
node->prev = be32_to_cpu(desc->prev);
node->next = be32_to_cpu(desc->next);
node->num_recs = be16_to_cpu(desc->num_recs);
node->type = desc->type;
node->height = desc->height;
- kunmap(node->page[0]);
+ kunmap_local(desc);
switch (node->type) {
case HFS_NODE_HEADER:
@@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
}
pagep = node->page;
- memset(kmap(*pagep) + node->page_offset, 0,
- min_t(int, PAGE_SIZE, tree->node_size));
+ memzero_page(*pagep, node->page_offset,
+ min_t(int, PAGE_SIZE, tree->node_size));
set_page_dirty(*pagep);
- kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_SIZE);
+ memzero_page(*++pagep, 0, PAGE_SIZE);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
clear_bit(HFS_BNODE_NEW, &node->flags);
wake_up(&node->lock_wq);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 66774f4cb4fd..9e1732a2b92a 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
goto free_inode;
/* Load the header */
- head = (struct hfs_btree_header_rec *)(kmap(page) +
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
sizeof(struct hfs_bnode_desc));
tree->root = be32_to_cpu(head->root);
tree->leaf_count = be32_to_cpu(head->leaf_count);
@@ -240,11 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
(tree->node_size + PAGE_SIZE - 1) >>
PAGE_SHIFT;
- kunmap(page);
+ kunmap_local(head);
put_page(page);
return tree;
fail_page:
+ kunmap_local(head);
put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfsplus_aops;
@@ -291,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree)
return -EIO;
/* Load the header */
page = node->page[0];
- head = (struct hfs_btree_header_rec *)(kmap(page) +
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
sizeof(struct hfs_bnode_desc));
head->root = cpu_to_be32(tree->root);
@@ -303,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree)
head->attributes = cpu_to_be32(tree->attributes);
head->depth = cpu_to_be16(tree->depth);
- kunmap(page);
+ kunmap_local(head);
set_page_dirty(page);
hfs_bnode_put(node);
return 0;
@@ -394,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
idx = 0;
@@ -407,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
idx += i;
data[off] |= m;
set_page_dirty(*pagep);
- kunmap(*pagep);
+ kunmap_local(data);
tree->free_nodes--;
mark_inode_dirty(tree->inode);
hfs_bnode_put(node);
@@ -417,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
if (++off >= PAGE_SIZE) {
- kunmap(*pagep);
- data = kmap(*++pagep);
+ kunmap_local(data);
+ data = kmap_local_page(*++pagep);
off = 0;
}
idx += 8;
len--;
}
- kunmap(*pagep);
+ kunmap_local(data);
nidx = node->next;
if (!nidx) {
hfs_dbg(BNODE_MOD, "create new bmap node\n");
@@ -440,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
}
}
@@ -490,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
}
off += node->page_offset + nidx / 8;
page = node->page[off >> PAGE_SHIFT];
- data = kmap(page);
+ data = kmap_local_page(page);
off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
@@ -498,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node)
pr_crit("trying to free free bnode "
"%u(%d)\n",
node->this, node->type);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
return;
}
data[off] = byte & ~m;
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
tree->free_nodes++;
mark_inode_dirty(tree->inode);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 07881b76d42f..277468783fee 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -103,7 +103,7 @@ static char *__dentry_name(struct dentry *dentry, char *name)
*/
BUG_ON(p + strlen(p) + 1 != name + PATH_MAX);
- strlcpy(name, root, PATH_MAX);
+ strscpy(name, root, PATH_MAX);
if (len > p - name) {
__putname(name);
return NULL;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f7a5b5124d8a..dd54f67e47fd 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -364,13 +364,155 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void remove_huge_page(struct page *page)
+static void hugetlb_delete_from_page_cache(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
+/*
+ * Called with i_mmap_rwsem held for inode based vma maps. This makes
+ * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault
+ * mutex for the page in the mapping. So, we can not race with page being
+ * faulted into the vma.
+ */
+static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
+ unsigned long addr, struct page *page)
+{
+ pte_t *ptep, pte;
+
+ ptep = huge_pte_offset(vma->vm_mm, addr,
+ huge_page_size(hstate_vma(vma)));
+
+ if (!ptep)
+ return false;
+
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte) || !pte_present(pte))
+ return false;
+
+ if (pte_page(pte) == page)
+ return true;
+
+ return false;
+}
+
+/*
+ * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
+{
+ if (vma->vm_pgoff < start)
+ return (start - vma->vm_pgoff) << PAGE_SHIFT;
+ else
+ return 0;
+}
+
+static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
+{
+ unsigned long t_end;
+
+ if (!end)
+ return vma->vm_end;
+
+ t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
+ if (t_end > vma->vm_end)
+ t_end = vma->vm_end;
+ return t_end;
+}
+
+/*
+ * Called with hugetlb fault mutex held. Therefore, no more mappings to
+ * this folio can be created while executing the routine.
+ */
+static void hugetlb_unmap_file_folio(struct hstate *h,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index)
+{
+ struct rb_root_cached *root = &mapping->i_mmap;
+ struct hugetlb_vma_lock *vma_lock;
+ struct page *page = &folio->page;
+ struct vm_area_struct *vma;
+ unsigned long v_start;
+ unsigned long v_end;
+ pgoff_t start, end;
+
+ start = index * pages_per_huge_page(h);
+ end = (index + 1) * pages_per_huge_page(h);
+
+ i_mmap_lock_write(mapping);
+retry:
+ vma_lock = NULL;
+ vma_interval_tree_foreach(vma, root, start, end - 1) {
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ continue;
+
+ if (!hugetlb_vma_trylock_write(vma)) {
+ vma_lock = vma->vm_private_data;
+ /*
+ * If we can not get vma lock, we need to drop
+ * immap_sema and take locks in order. First,
+ * take a ref on the vma_lock structure so that
+ * we can be guaranteed it will not go away when
+ * dropping immap_sema.
+ */
+ kref_get(&vma_lock->refs);
+ break;
+ }
+
+ unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
+ NULL, ZAP_FLAG_DROP_MARKER);
+ hugetlb_vma_unlock_write(vma);
+ }
+
+ i_mmap_unlock_write(mapping);
+
+ if (vma_lock) {
+ /*
+ * Wait on vma_lock. We know it is still valid as we have
+ * a reference. We must 'open code' vma locking as we do
+ * not know if vma_lock is still attached to vma.
+ */
+ down_write(&vma_lock->rw_sema);
+ i_mmap_lock_write(mapping);
+
+ vma = vma_lock->vma;
+ if (!vma) {
+ /*
+ * If lock is no longer attached to vma, then just
+ * unlock, drop our reference and retry looking for
+ * other vmas.
+ */
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ goto retry;
+ }
+
+ /*
+ * vma_lock is still attached to vma. Check to see if vma
+ * still maps page and if so, unmap.
+ */
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+ if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ unmap_hugepage_range(vma, vma->vm_start + v_start,
+ v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
+
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ hugetlb_vma_unlock_write(vma);
+
+ goto retry;
+ }
+}
+
static void
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
zap_flags_t zap_flags)
@@ -383,32 +525,66 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
* an inclusive "last".
*/
vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
- unsigned long v_offset;
+ unsigned long v_start;
unsigned long v_end;
+ if (!hugetlb_vma_trylock_write(vma))
+ continue;
+
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
+ NULL, zap_flags);
+
/*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
+ * Note that vma lock only exists for shared/non-private
+ * vmas. Therefore, lock is not held when calling
+ * unmap_hugepage_range for private vmas.
*/
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (!end)
- v_end = vma->vm_end;
- else {
- v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
- + vma->vm_start;
- if (v_end > vma->vm_end)
- v_end = vma->vm_end;
- }
+ hugetlb_vma_unlock_write(vma);
+ }
+}
- unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
- NULL, zap_flags);
+/*
+ * Called with hugetlb fault mutex held.
+ * Returns true if page was actually removed, false otherwise.
+ */
+static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index,
+ bool truncate_op)
+{
+ bool ret = false;
+
+ /*
+ * If folio is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) while holding
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the folio.
+ */
+ if (unlikely(folio_mapped(folio)))
+ hugetlb_unmap_file_folio(h, mapping, folio, index);
+
+ folio_lock(folio);
+ /*
+ * We must remove the folio from page cache before removing
+ * the region/ reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the region/reserve
+ * map could fail. Correspondingly, the subpool and global
+ * reserve usage count can need to be adjusted.
+ */
+ VM_BUG_ON(HPageRestoreReserve(&folio->page));
+ hugetlb_delete_from_page_cache(&folio->page);
+ ret = true;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode, index,
+ index + 1, 1)))
+ hugetlb_fix_reserve_counts(inode);
}
+
+ folio_unlock(folio);
+ return ret;
}
/*
@@ -418,10 +594,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
- * maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents
- * page faults in the truncated range by checking i_size. i_size is
- * modified while holding i_mmap_rwsem.
+ * maps and global counts. Page faults can race with truncation.
+ * During faults, hugetlb_no_page() checks i_size before page allocation,
+ * and again after obtaining page table lock. It will 'back out'
+ * allocations in the truncated range.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserve map
@@ -451,61 +627,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash = 0;
index = folio->index;
- if (!truncate_op) {
- /*
- * Only need to hold the fault mutex in the
- * hole punch case. This prevents races with
- * page faults. Races are not possible in the
- * case of truncation.
- */
- hash = hugetlb_fault_mutex_hash(mapping, index);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- }
+ hash = hugetlb_fault_mutex_hash(mapping, index);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
- * If folio is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) now after taking
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the folio.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
+ * Remove folio that was part of folio_batch.
*/
- if (unlikely(folio_mapped(folio))) {
- BUG_ON(truncate_op);
-
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_lock_write(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- hugetlb_vmdelete_list(&mapping->i_mmap,
- index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h),
- ZAP_FLAG_DROP_MARKER);
- i_mmap_unlock_write(mapping);
- }
-
- folio_lock(folio);
- /*
- * We must free the huge page and remove from page
- * cache (remove_huge_page) BEFORE removing the
- * region/reserve map (hugetlb_unreserve_pages). In
- * rare out of memory conditions, removal of the
- * region/reserve map could fail. Correspondingly,
- * the subpool and global reserve usage count can need
- * to be adjusted.
- */
- VM_BUG_ON(HPageRestoreReserve(&folio->page));
- remove_huge_page(&folio->page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(inode,
- index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
- }
-
- folio_unlock(folio);
- if (!truncate_op)
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
folio_batch_release(&fbatch);
cond_resched();
@@ -543,8 +675,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
- i_mmap_lock_write(mapping);
i_size_write(inode, offset);
+ i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
ZAP_FLAG_DROP_MARKER);
@@ -703,11 +835,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
- /*
- * fault mutex taken here, protects against fault path
- * and hole punch. inode_lock previously taken protects
- * against truncation.
- */
+ /* mutex taken here, fault path and hole punch */
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -737,7 +865,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
}
clear_huge_page(page, addr, pages_per_huge_page(h));
__SetPageUptodate(page);
- error = huge_add_to_page_cache(page, mapping, index);
+ error = hugetlb_add_to_page_cache(page, mapping, index);
if (unlikely(error)) {
restore_reserve_on_error(h, &pseudo_vma, addr, page);
put_page(page);
@@ -749,7 +877,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
SetHPageMigratable(page);
/*
- * unlock_page because locked by huge_add_to_page_cache()
+ * unlock_page because locked by hugetlb_add_to_page_cache()
* put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
@@ -885,33 +1013,18 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* File creation. Allocate an inode, and we're done..
*/
-static int do_hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- dev_t dev,
- bool tmpfile)
+static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
- int error = -ENOSPC;
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
- if (inode) {
- dir->i_ctime = dir->i_mtime = current_time(dir);
- if (tmpfile) {
- d_tmpfile(dentry, inode);
- } else {
- d_instantiate(dentry, inode);
- dget(dentry);/* Extra count - pin the dentry in core */
- }
- error = 0;
- }
- return error;
-}
-
-static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
-{
- return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+ if (!inode)
+ return -ENOSPC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry);/* Extra count - pin the dentry in core */
+ return 0;
}
static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
@@ -932,10 +1045,17 @@ static int hugetlbfs_create(struct user_namespace *mnt_userns,
}
static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
- struct inode *dir, struct dentry *dentry,
+ struct inode *dir, struct file *file,
umode_t mode)
{
- return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+ struct inode *inode;
+
+ inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+ if (!inode)
+ return -ENOSPC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_tmpfile(file, inode);
+ return finish_open_simple(file, 0);
}
static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
@@ -994,7 +1114,7 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct inode *inode = mapping->host;
pgoff_t index = page->index;
- remove_huge_page(page);
+ hugetlb_delete_from_page_cache(page);
if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
hugetlb_fix_reserve_counts(inode);
diff --git a/fs/inode.c b/fs/inode.c
index ba1de23c13c1..b608528efd3a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -192,8 +192,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_wb_frn_history = 0;
#endif
- if (security_inode_alloc(inode))
- goto out;
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -228,11 +226,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
+
+ if (unlikely(security_inode_alloc(inode)))
+ return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
-out:
- return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);
diff --git a/fs/internal.h b/fs/internal.h
index 3e206d3e317c..6f0386b34fae 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
-int may_linkat(struct user_namespace *mnt_userns, struct path *link);
+int may_linkat(struct user_namespace *mnt_userns, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
@@ -102,6 +102,16 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
extern struct file *alloc_empty_file(int, const struct cred *);
extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+static inline void put_file_access(struct file *file)
+{
+ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_dec(file->f_inode);
+ } else if (file->f_mode & FMODE_WRITER) {
+ put_write_access(file->f_inode);
+ __mnt_drop_write(file->f_path.mnt);
+ }
+}
+
/*
* super.c
*/
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ca5c62901541..91ee0b308e13 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1360,6 +1360,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
error = wpc->ops->map_blocks(wpc, inode, pos);
if (error)
break;
+ trace_iomap_writepage_map(inode, &wpc->iomap);
if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
continue;
if (wpc->iomap.type == IOMAP_HOLE)
@@ -1421,7 +1422,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
if (!count)
folio_end_writeback(folio);
done:
- mapping_set_error(folio->mapping, error);
+ mapping_set_error(inode->i_mapping, error);
return error;
}
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index d48868fc40d7..f6ea9540d082 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -148,6 +148,7 @@ DEFINE_EVENT(iomap_class, name, \
TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
+DEFINE_IOMAP_EVENT(iomap_writepage_map);
TRACE_EVENT(iomap_iter,
TP_PROTO(struct iomap_iter *iter, const void *ops,
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index b466172eec25..c4da3f634b92 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
for ( i = 0 ; i < pcount ; i++ ) {
if (!pages[i])
continue;
- memset(page_address(pages[i]), 0, PAGE_SIZE);
- flush_dcache_page(pages[i]);
+ memzero_page(pages[i], 0, PAGE_SIZE);
SetPageUptodate(pages[i]);
}
return ((loff_t)pcount) << PAGE_SHIFT;
@@ -82,7 +81,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
return 0;
}
haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
- ll_rw_block(REQ_OP_READ, haveblocks, bhs);
+ bh_read_batch(haveblocks, bhs);
curbh = 0;
curpage = 0;
@@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
zerr != Z_STREAM_END) {
if (!stream.avail_out) {
if (pages[curpage]) {
- stream.next_out = page_address(pages[curpage])
+ stream.next_out = kmap_local_page(pages[curpage])
+ poffset;
stream.avail_out = PAGE_SIZE - poffset;
poffset = 0;
@@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
flush_dcache_page(pages[curpage]);
SetPageUptodate(pages[curpage]);
}
+ if (stream.next_out != (unsigned char *)zisofs_sink_page) {
+ kunmap_local(stream.next_out);
+ stream.next_out = NULL;
+ }
curpage++;
}
if (!stream.avail_in)
@@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
}
inflate_out:
zlib_inflateEnd(&stream);
+ if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page)
+ kunmap_local(stream.next_out);
z_eio:
mutex_unlock(&zisofs_zlib_lock);
@@ -283,9 +288,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
}
if (poffset && *pages) {
- memset(page_address(*pages) + poffset, 0,
- PAGE_SIZE - poffset);
- flush_dcache_page(*pages);
+ memzero_page(*pages, poffset, PAGE_SIZE - poffset);
SetPageUptodate(*pages);
}
return 0;
@@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
for (i = 0; i < pcount; i++, index++) {
if (i != full_page)
pages[i] = grab_cache_page_nowait(mapping, index);
- if (pages[i]) {
+ if (pages[i])
ClearPageError(pages[i]);
- kmap(pages[i]);
- }
}
err = zisofs_fill_pages(inode, full_page, pcount, pages);
@@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
flush_dcache_page(pages[i]);
if (i == full_page && err)
SetPageError(pages[i]);
- kunmap(pages[i]);
unlock_page(pages[i]);
if (i != full_page)
put_page(pages[i]);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 88bf20303466..df9d70588b60 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1277,13 +1277,11 @@ static int isofs_read_level3_size(struct inode *inode)
} while (more_entries);
out:
kfree(tmpde);
- if (bh)
- brelse(bh);
+ brelse(bh);
return 0;
out_nomem:
- if (bh)
- brelse(bh);
+ brelse(bh);
return -ENOMEM;
out_noread:
@@ -1486,8 +1484,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
ret = 0;
out:
kfree(tmpde);
- if (bh)
- brelse(bh);
+ brelse(bh);
return ret;
out_badread:
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b2b2bc9b88d9..885a7a6cc53e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -122,8 +122,8 @@ static int journal_submit_commit_record(journal_t *journal,
{
struct commit_header *tmp;
struct buffer_head *bh;
- int ret;
struct timespec64 now;
+ blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;
*cbh = NULL;
@@ -155,13 +155,11 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
- ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH |
- REQ_FUA, bh);
- else
- ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+ write_flags |= REQ_PREFLUSH | REQ_FUA;
+ submit_bh(write_flags, bh);
*cbh = bh;
- return ret;
+ return 0;
}
/*
@@ -570,7 +568,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
jbd2_debug(3, "JBD2: commit phase 2a\n");
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6350d3857c89..2696f43e7239 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -923,10 +923,16 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
bh = journal->j_fc_wbuf[i];
wait_on_buffer(bh);
+ /*
+ * Update j_fc_off so jbd2_fc_release_bufs can release remain
+ * buffer head.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ journal->j_fc_off = i + 1;
+ return -EIO;
+ }
put_bh(bh);
journal->j_fc_wbuf[i] = NULL;
- if (unlikely(!buffer_uptodate(bh)))
- return -EIO;
}
return 0;
@@ -1606,7 +1612,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
journal_superblock_t *sb = journal->j_superblock;
- int ret;
+ int ret = 0;
/* Buffer got discarded which means block device got invalidated */
if (!buffer_mapped(bh)) {
@@ -1636,7 +1642,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
sb->s_checksum = jbd2_superblock_csum(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(REQ_OP_WRITE | write_flags, bh);
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
wait_on_buffer(bh);
if (buffer_write_io_error(bh)) {
clear_buffer_write_io_error(bh);
@@ -1644,9 +1650,8 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
ret = -EIO;
}
if (ret) {
- printk(KERN_ERR "JBD2: Error %d detected when updating "
- "journal superblock for %s.\n", ret,
- journal->j_devname);
+ printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n",
+ journal->j_devname);
if (!is_journal_aborted(journal))
jbd2_journal_abort(journal, ret);
}
@@ -1893,19 +1898,16 @@ static int journal_get_superblock(journal_t *journal)
{
struct buffer_head *bh;
journal_superblock_t *sb;
- int err = -EIO;
+ int err;
bh = journal->j_sb_buffer;
J_ASSERT(bh != NULL);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- printk(KERN_ERR
- "JBD2: IO error reading journal superblock\n");
- goto out;
- }
+ err = bh_read(bh, 0);
+ if (err < 0) {
+ printk(KERN_ERR
+ "JBD2: IO error reading journal superblock\n");
+ goto out;
}
if (buffer_verified(bh))
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index f548479615c6..8286a9ec122f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
- ll_rw_block(REQ_OP_READ, nbufs, bufs);
+ bh_readahead_batch(nbufs, bufs, 0);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
@@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
if (nbufs)
- ll_rw_block(REQ_OP_READ, nbufs, bufs);
+ bh_readahead_batch(nbufs, bufs, 0);
err = 0;
failed:
@@ -152,9 +152,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return -ENOMEM;
if (!buffer_uptodate(bh)) {
- /* If this is a brand new buffer, start readahead.
- Otherwise, we assume we are already reading it. */
- if (!buffer_req(bh))
+ /*
+ * If this is a brand new buffer, start readahead.
+ * Otherwise, we assume we are already reading it.
+ */
+ bool need_readahead = !buffer_req(bh);
+
+ bh_read_nowait(bh, 0);
+ if (need_readahead)
do_readahead(journal, offset);
wait_on_buffer(bh);
}
@@ -256,6 +261,7 @@ static int fc_do_one_pass(journal_t *journal,
err = journal->j_fc_replay_callback(journal, bh, pass,
next_fc_block - journal->j_fc_first,
expected_commit_id);
+ brelse(bh);
next_fc_block++;
if (err < 0 || err == JBD2_FC_REPLAY_STOP)
break;
@@ -687,7 +693,6 @@ static int do_one_pass(journal_t *journal,
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
- /* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e1be93ccd81c..6a404ac1c178 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -168,7 +168,7 @@ static void wait_transaction_locked(journal_t *journal)
int need_to_start;
tid_t tid = journal->j_running_transaction->t_tid;
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
@@ -194,7 +194,7 @@ static void wait_transaction_switching(journal_t *journal)
read_unlock(&journal->j_state_lock);
return;
}
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
read_unlock(&journal->j_state_lock);
/*
@@ -920,7 +920,7 @@ void jbd2_journal_unlock_updates (journal_t *journal)
write_lock(&journal->j_state_lock);
--journal->j_barrier_count;
write_unlock(&journal->j_state_lock);
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
}
static void warn_dirty_buffer(struct buffer_head *bh)
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index c6821a509481..4061e0ba7010 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
{
int i, ret;
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
ops.mode = MTD_OPS_AUTO_OOB;
ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
@@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
struct jffs2_eraseblock *jeb)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
ops.mode = MTD_OPS_AUTO_OOB;
@@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
struct jffs2_eraseblock *jeb)
{
int ret;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
ops.mode = MTD_OPS_AUTO_OOB;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 1cc88ba6de90..3990f3e270cb 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -472,6 +472,16 @@ static void kernfs_drain(struct kernfs_node *kn)
lockdep_assert_held_write(&root->kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn));
+ /*
+ * Skip draining if already fully drained. This avoids draining and its
+ * lockdep annotations for nodes which have never been activated
+ * allowing embedding kernfs_remove() in create error paths without
+ * worrying about draining.
+ */
+ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS &&
+ !kernfs_should_drain_open_files(kn))
+ return;
+
up_write(&root->kernfs_rwsem);
if (kernfs_lockdep(kn)) {
@@ -480,7 +490,6 @@ static void kernfs_drain(struct kernfs_node *kn)
lock_contended(&kn->dep_map, _RET_IP_);
}
- /* but everyone should wait for draining */
wait_event(root->deactivate_waitq,
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
@@ -489,7 +498,8 @@ static void kernfs_drain(struct kernfs_node *kn)
rwsem_release(&kn->dep_map, _RET_IP_);
}
- kernfs_drain_open_files(kn);
+ if (kernfs_should_drain_open_files(kn))
+ kernfs_drain_open_files(kn);
down_write(&root->kernfs_rwsem);
}
@@ -695,13 +705,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
goto err_unlock;
}
- /*
- * ACTIVATED is protected with kernfs_mutex but it was clear when
- * @kn was added to idr and we just wanna see it set. No need to
- * grab kernfs_mutex.
- */
- if (unlikely(!(kn->flags & KERNFS_ACTIVATED) ||
- !atomic_inc_not_zero(&kn->count)))
+ if (unlikely(!kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
goto err_unlock;
spin_unlock(&kernfs_idr_lock);
@@ -743,10 +747,7 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
ret = -ENOENT;
- if (parent->flags & KERNFS_EMPTY_DIR)
- goto out_unlock;
-
- if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
+ if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR))
goto out_unlock;
kn->hash = kernfs_name_hash(kn->name, kn->ns);
@@ -1304,6 +1305,21 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
return pos->parent;
}
+static void kernfs_activate_one(struct kernfs_node *kn)
+{
+ lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
+
+ kn->flags |= KERNFS_ACTIVATED;
+
+ if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING)))
+ return;
+
+ WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb));
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);
+}
+
/**
* kernfs_activate - activate a node which started deactivated
* @kn: kernfs_node whose subtree is to be activated
@@ -1325,15 +1341,42 @@ void kernfs_activate(struct kernfs_node *kn)
down_write(&root->kernfs_rwsem);
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn))) {
- if (pos->flags & KERNFS_ACTIVATED)
- continue;
+ while ((pos = kernfs_next_descendant_post(pos, kn)))
+ kernfs_activate_one(pos);
- WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
- WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
+ up_write(&root->kernfs_rwsem);
+}
- atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
- pos->flags |= KERNFS_ACTIVATED;
+/**
+ * kernfs_show - show or hide a node
+ * @kn: kernfs_node to show or hide
+ * @show: whether to show or hide
+ *
+ * If @show is %false, @kn is marked hidden and deactivated. A hidden node is
+ * ignored in future activaitons. If %true, the mark is removed and activation
+ * state is restored. This function won't implicitly activate a new node in a
+ * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet.
+ *
+ * To avoid recursion complexities, directories aren't supported for now.
+ */
+void kernfs_show(struct kernfs_node *kn, bool show)
+{
+ struct kernfs_root *root = kernfs_root(kn);
+
+ if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR))
+ return;
+
+ down_write(&root->kernfs_rwsem);
+
+ if (show) {
+ kn->flags &= ~KERNFS_HIDDEN;
+ if (kn->flags & KERNFS_ACTIVATED)
+ kernfs_activate_one(kn);
+ } else {
+ kn->flags |= KERNFS_HIDDEN;
+ if (kernfs_active(kn))
+ atomic_add(KN_DEACTIVATED_BIAS, &kn->active);
+ kernfs_drain(kn);
}
up_write(&root->kernfs_rwsem);
@@ -1358,34 +1401,27 @@ static void __kernfs_remove(struct kernfs_node *kn)
pr_debug("kernfs %s: removing\n", kn->name);
- /* prevent any new usage under @kn by deactivating all nodes */
+ /* prevent new usage by marking all nodes removing and deactivating */
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn)))
+ while ((pos = kernfs_next_descendant_post(pos, kn))) {
+ pos->flags |= KERNFS_REMOVING;
if (kernfs_active(pos))
atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
+ }
/* deactivate and unlink the subtree node-by-node */
do {
pos = kernfs_leftmost_descendant(kn);
/*
- * kernfs_drain() drops kernfs_rwsem temporarily and @pos's
+ * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's
* base ref could have been put by someone else by the time
* the function returns. Make sure it doesn't go away
* underneath us.
*/
kernfs_get(pos);
- /*
- * Drain iff @kn was activated. This avoids draining and
- * its lockdep annotations for nodes which have never been
- * activated and allows embedding kernfs_remove() in create
- * error paths without worrying about draining.
- */
- if (kn->flags & KERNFS_ACTIVATED)
- kernfs_drain(pos);
- else
- WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+ kernfs_drain(pos);
/*
* kernfs_unlink_sibling() succeeds once per node. Use it
@@ -1585,8 +1621,11 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
down_write(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
- if (kn)
+ if (kn) {
+ kernfs_get(kn);
__kernfs_remove(kn);
+ kernfs_put(kn);
+ }
up_write(&root->kernfs_rwsem);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b3ec34386b43..9ab6c92e02da 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -23,6 +23,8 @@ struct kernfs_open_node {
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
+ unsigned int nr_mmapped;
+ unsigned int nr_to_release;
};
/*
@@ -57,31 +59,17 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
}
/**
- * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
- *
- * @of: associated kernfs_open_file instance.
- * @kn: target kernfs_node.
- *
- * Fetch and return ->attr.open of @kn if @of->list is non empty.
- * If @of->list is not empty we can safely assume that @of is on
- * @kn->attr.open->files list and this guarantees that @kn->attr.open
- * will not vanish i.e. dereferencing outside RCU read-side critical
- * section is safe here.
- *
- * The caller needs to make sure that @of->list is not empty.
+ * of_on - Return the kernfs_open_node of the specified kernfs_open_file
+ * @of: taret kernfs_open_file
*/
-static struct kernfs_open_node *
-kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn)
+static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
{
- struct kernfs_open_node *on;
-
- on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list));
-
- return on;
+ return rcu_dereference_protected(of->kn->attr.open,
+ !list_empty(&of->list));
}
/**
- * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn
+ * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
*
* @kn: target kernfs_node.
*
@@ -96,7 +84,7 @@ kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn)
* The caller needs to make sure that kernfs_open_file_mutex is held.
*/
static struct kernfs_open_node *
-kernfs_deref_open_node_protected(struct kernfs_node *kn)
+kernfs_deref_open_node_locked(struct kernfs_node *kn)
{
return rcu_dereference_protected(kn->attr.open,
lockdep_is_held(kernfs_open_file_mutex_ptr(kn)));
@@ -207,12 +195,8 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
static int kernfs_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
- struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn);
-
- if (!on)
- return -EINVAL;
- of->event = atomic_read(&on->event);
+ of->event = atomic_read(&of_on(of)->event);
return of->kn->attr.ops->seq_show(sf, v);
}
@@ -235,7 +219,6 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
const struct kernfs_ops *ops;
- struct kernfs_open_node *on;
char *buf;
buf = of->prealloc_buf;
@@ -257,14 +240,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out_free;
}
- on = kernfs_deref_open_node(of, of->kn);
- if (!on) {
- len = -EINVAL;
- mutex_unlock(&of->mutex);
- goto out_free;
- }
-
- of->event = atomic_read(&on->event);
+ of->event = atomic_read(&of_on(of)->event);
ops = kernfs_ops(of->kn);
if (ops->read)
@@ -553,6 +529,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
rc = 0;
of->mmapped = true;
+ of_on(of)->nr_mmapped++;
of->vm_ops = vma->vm_ops;
vma->vm_ops = &kernfs_vm_ops;
out_put:
@@ -580,31 +557,30 @@ out_unlock:
static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
- struct kernfs_open_node *on, *new_on = NULL;
- struct mutex *mutex = NULL;
+ struct kernfs_open_node *on;
+ struct mutex *mutex;
mutex = kernfs_open_file_mutex_lock(kn);
- on = kernfs_deref_open_node_protected(kn);
+ on = kernfs_deref_open_node_locked(kn);
- if (on) {
- list_add_tail(&of->list, &on->files);
- mutex_unlock(mutex);
- return 0;
- } else {
+ if (!on) {
/* not there, initialize a new one */
- new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
- if (!new_on) {
+ on = kzalloc(sizeof(*on), GFP_KERNEL);
+ if (!on) {
mutex_unlock(mutex);
return -ENOMEM;
}
- atomic_set(&new_on->event, 1);
- init_waitqueue_head(&new_on->poll);
- INIT_LIST_HEAD(&new_on->files);
- list_add_tail(&of->list, &new_on->files);
- rcu_assign_pointer(kn->attr.open, new_on);
+ atomic_set(&on->event, 1);
+ init_waitqueue_head(&on->poll);
+ INIT_LIST_HEAD(&on->files);
+ rcu_assign_pointer(kn->attr.open, on);
}
- mutex_unlock(mutex);
+ list_add_tail(&of->list, &on->files);
+ if (kn->flags & KERNFS_HAS_RELEASE)
+ on->nr_to_release++;
+
+ mutex_unlock(mutex);
return 0;
}
@@ -613,6 +589,7 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
*
* @kn: target kernfs_node
* @of: associated kernfs_open_file
+ * @open_failed: ->open() failed, cancel ->release()
*
* Unlink @of from list of @kn's associated open files. If list of
* associated open files becomes empty, disassociate and free
@@ -622,21 +599,30 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
* None.
*/
static void kernfs_unlink_open_file(struct kernfs_node *kn,
- struct kernfs_open_file *of)
+ struct kernfs_open_file *of,
+ bool open_failed)
{
struct kernfs_open_node *on;
- struct mutex *mutex = NULL;
+ struct mutex *mutex;
mutex = kernfs_open_file_mutex_lock(kn);
- on = kernfs_deref_open_node_protected(kn);
+ on = kernfs_deref_open_node_locked(kn);
if (!on) {
mutex_unlock(mutex);
return;
}
- if (of)
+ if (of) {
+ if (kn->flags & KERNFS_HAS_RELEASE) {
+ WARN_ON_ONCE(of->released == open_failed);
+ if (open_failed)
+ on->nr_to_release--;
+ }
+ if (of->mmapped)
+ on->nr_mmapped--;
list_del(&of->list);
+ }
if (list_empty(&on->files)) {
rcu_assign_pointer(kn->attr.open, NULL);
@@ -763,7 +749,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
return 0;
err_put_node:
- kernfs_unlink_open_file(kn, of);
+ kernfs_unlink_open_file(kn, of, true);
err_seq_release:
seq_release(inode, file);
err_free:
@@ -795,6 +781,7 @@ static void kernfs_release_file(struct kernfs_node *kn,
*/
kn->attr.ops->release(of);
of->released = true;
+ of_on(of)->nr_to_release--;
}
}
@@ -802,15 +789,16 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
struct kernfs_node *kn = inode->i_private;
struct kernfs_open_file *of = kernfs_of(filp);
- struct mutex *mutex = NULL;
if (kn->flags & KERNFS_HAS_RELEASE) {
+ struct mutex *mutex;
+
mutex = kernfs_open_file_mutex_lock(kn);
kernfs_release_file(kn, of);
mutex_unlock(mutex);
}
- kernfs_unlink_open_file(kn, of);
+ kernfs_unlink_open_file(kn, of, false);
seq_release(inode, filp);
kfree(of->prealloc_buf);
kfree(of);
@@ -818,28 +806,33 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
return 0;
}
-void kernfs_drain_open_files(struct kernfs_node *kn)
+bool kernfs_should_drain_open_files(struct kernfs_node *kn)
{
struct kernfs_open_node *on;
- struct kernfs_open_file *of;
- struct mutex *mutex = NULL;
-
- if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
- return;
+ bool ret;
/*
- * lockless opportunistic check is safe below because no one is adding to
- * ->attr.open at this point of time. This check allows early bail out
- * if ->attr.open is already NULL. kernfs_unlink_open_file makes
- * ->attr.open NULL only while holding kernfs_open_file_mutex so below
- * check under kernfs_open_file_mutex_ptr(kn) will ensure bailing out if
- * ->attr.open became NULL while waiting for the mutex.
+ * @kn being deactivated guarantees that @kn->attr.open can't change
+ * beneath us making the lockless test below safe.
*/
- if (!rcu_access_pointer(kn->attr.open))
- return;
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ rcu_read_lock();
+ on = rcu_dereference(kn->attr.open);
+ ret = on && (on->nr_mmapped || on->nr_to_release);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+void kernfs_drain_open_files(struct kernfs_node *kn)
+{
+ struct kernfs_open_node *on;
+ struct kernfs_open_file *of;
+ struct mutex *mutex;
mutex = kernfs_open_file_mutex_lock(kn);
- on = kernfs_deref_open_node_protected(kn);
+ on = kernfs_deref_open_node_locked(kn);
if (!on) {
mutex_unlock(mutex);
return;
@@ -848,13 +841,17 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
- if (kn->flags & KERNFS_HAS_MMAP)
+ if (of->mmapped) {
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+ of->mmapped = false;
+ on->nr_mmapped--;
+ }
if (kn->flags & KERNFS_HAS_RELEASE)
kernfs_release_file(kn, of);
}
+ WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);
mutex_unlock(mutex);
}
@@ -874,11 +871,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
*/
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
- struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
- struct kernfs_open_node *on = kernfs_deref_open_node(of, kn);
-
- if (!on)
- return EPOLLERR;
+ struct kernfs_open_node *on = of_on(of);
poll_wait(of->file, &on->poll, wait);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3ae214d02d44..fc5821effd97 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -157,6 +157,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
*/
extern const struct file_operations kernfs_file_fops;
+bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);
/*
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index c5a5c7b90d72..2a39ffb8423b 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -424,6 +424,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
NTLMSSP_NEGOTIATE_56);
}
+ if (cflags & NTLMSSP_NEGOTIATE_SEAL && smb3_encryption_negotiated(conn))
+ flags |= NTLMSSP_NEGOTIATE_SEAL;
+
if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN)
flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
@@ -984,13 +987,16 @@ out:
return rc;
}
-static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id,
+static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
int enc, u8 *key)
{
struct ksmbd_session *sess;
u8 *ses_enc_key;
- sess = ksmbd_session_lookup_all(conn, ses_id);
+ if (enc)
+ sess = work->sess;
+ else
+ sess = ksmbd_session_lookup_all(work->conn, ses_id);
if (!sess)
return -EINVAL;
@@ -1078,9 +1084,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
return sg;
}
-int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov,
unsigned int nvec, int enc)
{
+ struct ksmbd_conn *conn = work->conn;
struct smb2_transform_hdr *tr_hdr = smb2_get_msg(iov[0].iov_base);
unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
int rc;
@@ -1094,7 +1101,7 @@ int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
struct ksmbd_crypto_ctx *ctx;
- rc = ksmbd_get_encryption_key(conn,
+ rc = ksmbd_get_encryption_key(work,
le64_to_cpu(tr_hdr->SessionId),
enc,
key);
diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h
index 25b772653de0..362b6159a6cf 100644
--- a/fs/ksmbd/auth.h
+++ b/fs/ksmbd/auth.h
@@ -33,9 +33,10 @@
struct ksmbd_session;
struct ksmbd_conn;
+struct ksmbd_work;
struct kvec;
-int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov,
unsigned int nvec, int enc);
void ksmbd_copy_gss_neg_header(void *buf);
int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess,
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 756ad631c019..12be8386446a 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -60,6 +60,12 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->local_nls = load_nls("utf8");
if (!conn->local_nls)
conn->local_nls = load_nls_default();
+ if (IS_ENABLED(CONFIG_UNICODE))
+ conn->um = utf8_load(UNICODE_AGE(12, 1, 0));
+ else
+ conn->um = ERR_PTR(-EOPNOTSUPP);
+ if (IS_ERR(conn->um))
+ conn->um = NULL;
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
conn->total_credits = 1;
@@ -350,6 +356,8 @@ out:
wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0);
+ if (IS_ENABLED(CONFIG_UNICODE))
+ utf8_unload(conn->um);
unload_nls(conn->local_nls);
if (default_conn_ops.terminate_fn)
default_conn_ops.terminate_fn(conn);
diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
index e7f7d5707951..3643354a3fa7 100644
--- a/fs/ksmbd/connection.h
+++ b/fs/ksmbd/connection.h
@@ -14,6 +14,7 @@
#include <net/request_sock.h>
#include <linux/kthread.h>
#include <linux/nls.h>
+#include <linux/unicode.h>
#include "smb_common.h"
#include "ksmbd_work.h"
@@ -46,6 +47,7 @@ struct ksmbd_conn {
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
+ struct unicode_map *um;
struct list_head conns_list;
/* smb session 1 per user */
struct xarray sessions;
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index e0cbcfa98c7e..ff07c67f4565 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -163,7 +163,8 @@ struct ksmbd_share_config_response {
__u16 force_directory_mode;
__u16 force_uid;
__u16 force_gid;
- __u32 reserved[128]; /* Reserved room */
+ __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME];
+ __u32 reserved[112]; /* Reserved room */
__u32 veto_list_sz;
__s8 ____payload[];
};
diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c
index c9bca1c2c834..328a412259dc 100644
--- a/fs/ksmbd/mgmt/share_config.c
+++ b/fs/ksmbd/mgmt/share_config.c
@@ -16,6 +16,7 @@
#include "user_config.h"
#include "user_session.h"
#include "../transport_ipc.h"
+#include "../misc.h"
#define SHARE_HASH_BITS 3
static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS);
@@ -26,7 +27,7 @@ struct ksmbd_veto_pattern {
struct list_head list;
};
-static unsigned int share_name_hash(char *name)
+static unsigned int share_name_hash(const char *name)
{
return jhash(name, strlen(name), 0);
}
@@ -72,7 +73,7 @@ __get_share_config(struct ksmbd_share_config *share)
return share;
}
-static struct ksmbd_share_config *__share_lookup(char *name)
+static struct ksmbd_share_config *__share_lookup(const char *name)
{
struct ksmbd_share_config *share;
unsigned int key = share_name_hash(name);
@@ -119,7 +120,8 @@ static int parse_veto_list(struct ksmbd_share_config *share,
return 0;
}
-static struct ksmbd_share_config *share_config_request(char *name)
+static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
+ const char *name)
{
struct ksmbd_share_config_response *resp;
struct ksmbd_share_config *share = NULL;
@@ -133,6 +135,19 @@ static struct ksmbd_share_config *share_config_request(char *name)
if (resp->flags == KSMBD_SHARE_FLAG_INVALID)
goto out;
+ if (*resp->share_name) {
+ char *cf_resp_name;
+ bool equal;
+
+ cf_resp_name = ksmbd_casefold_sharename(um, resp->share_name);
+ if (IS_ERR(cf_resp_name))
+ goto out;
+ equal = !strcmp(cf_resp_name, name);
+ kfree(cf_resp_name);
+ if (!equal)
+ goto out;
+ }
+
share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL);
if (!share)
goto out;
@@ -190,20 +205,11 @@ out:
return share;
}
-static void strtolower(char *share_name)
-{
- while (*share_name) {
- *share_name = tolower(*share_name);
- share_name++;
- }
-}
-
-struct ksmbd_share_config *ksmbd_share_config_get(char *name)
+struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+ const char *name)
{
struct ksmbd_share_config *share;
- strtolower(name);
-
down_read(&shares_table_lock);
share = __share_lookup(name);
if (share)
@@ -212,7 +218,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(char *name)
if (share)
return share;
- return share_config_request(name);
+ return share_config_request(um, name);
}
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h
index 902f2cb1963a..3fd338293942 100644
--- a/fs/ksmbd/mgmt/share_config.h
+++ b/fs/ksmbd/mgmt/share_config.h
@@ -9,6 +9,7 @@
#include <linux/workqueue.h>
#include <linux/hashtable.h>
#include <linux/path.h>
+#include <linux/unicode.h>
struct ksmbd_share_config {
char *name;
@@ -74,7 +75,8 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
__ksmbd_share_config_put(share);
}
-struct ksmbd_share_config *ksmbd_share_config_get(char *name);
+struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+ const char *name);
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
const char *filename);
#endif /* __SHARE_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c
index 97ab7987df6e..8ce17b3fb8da 100644
--- a/fs/ksmbd/mgmt/tree_connect.c
+++ b/fs/ksmbd/mgmt/tree_connect.c
@@ -17,7 +17,7 @@
struct ksmbd_tree_conn_status
ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
- char *share_name)
+ const char *share_name)
{
struct ksmbd_tree_conn_status status = {-ENOENT, NULL};
struct ksmbd_tree_connect_response *resp = NULL;
@@ -26,7 +26,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
struct sockaddr *peer_addr;
int ret;
- sc = ksmbd_share_config_get(share_name);
+ sc = ksmbd_share_config_get(conn->um, share_name);
if (!sc)
return status;
@@ -61,7 +61,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
struct ksmbd_share_config *new_sc;
ksmbd_share_config_del(sc);
- new_sc = ksmbd_share_config_get(share_name);
+ new_sc = ksmbd_share_config_get(conn->um, share_name);
if (!new_sc) {
pr_err("Failed to update stale share config\n");
status.ret = -ESTALE;
diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h
index 71e50271dccf..0f97ddc1e39c 100644
--- a/fs/ksmbd/mgmt/tree_connect.h
+++ b/fs/ksmbd/mgmt/tree_connect.h
@@ -42,7 +42,7 @@ struct ksmbd_session;
struct ksmbd_tree_conn_status
ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
- char *share_name);
+ const char *share_name);
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn);
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index df991107ad2c..9e8afaa686e3 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/xattr.h>
#include <linux/fs.h>
+#include <linux/unicode.h>
#include "misc.h"
#include "smb_common.h"
@@ -159,7 +160,7 @@ out:
*/
char *convert_to_nt_pathname(struct ksmbd_share_config *share,
- struct path *path)
+ const struct path *path)
{
char *pathname, *ab_pathname, *nt_pathname;
int share_path_len = share->path_sz;
@@ -226,26 +227,53 @@ void ksmbd_conv_path_to_windows(char *path)
strreplace(path, '/', '\\');
}
+char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name)
+{
+ char *cf_name;
+ int cf_len;
+
+ cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, GFP_KERNEL);
+ if (!cf_name)
+ return ERR_PTR(-ENOMEM);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && um) {
+ const struct qstr q_name = {.name = name, .len = strlen(name)};
+
+ cf_len = utf8_casefold(um, &q_name, cf_name,
+ KSMBD_REQ_MAX_SHARE_NAME);
+ if (cf_len < 0)
+ goto out_ascii;
+
+ return cf_name;
+ }
+
+out_ascii:
+ cf_len = strscpy(cf_name, name, KSMBD_REQ_MAX_SHARE_NAME);
+ if (cf_len < 0) {
+ kfree(cf_name);
+ return ERR_PTR(-E2BIG);
+ }
+
+ for (; *cf_name; ++cf_name)
+ *cf_name = isascii(*cf_name) ? tolower(*cf_name) : *cf_name;
+ return cf_name - cf_len;
+}
+
/**
* ksmbd_extract_sharename() - get share name from tree connect request
* @treename: buffer containing tree name and share name
*
* Return: share name on success, otherwise error
*/
-char *ksmbd_extract_sharename(char *treename)
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename)
{
- char *name = treename;
- char *dst;
- char *pos = strrchr(name, '\\');
+ const char *name = treename, *pos = strrchr(name, '\\');
if (pos)
name = (pos + 1);
/* caller has to free the memory */
- dst = kstrdup(name, GFP_KERNEL);
- if (!dst)
- return ERR_PTR(-ENOMEM);
- return dst;
+ return ksmbd_casefold_sharename(um, name);
}
/**
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
index aae2a252945f..1facfcd21200 100644
--- a/fs/ksmbd/misc.h
+++ b/fs/ksmbd/misc.h
@@ -15,12 +15,13 @@ int match_pattern(const char *str, size_t len, const char *pattern);
int ksmbd_validate_filename(char *filename);
int parse_stream_name(char *filename, char **stream_name, int *s_type);
char *convert_to_nt_pathname(struct ksmbd_share_config *share,
- struct path *path);
+ const struct path *path);
int get_nlink(struct kstat *st);
void ksmbd_conv_path_to_unix(char *path);
void ksmbd_strip_last_slash(char *path);
void ksmbd_conv_path_to_windows(char *path);
-char *ksmbd_extract_sharename(char *treename);
+char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name);
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename);
char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name);
#define KSMBD_DIR_INFO_ALIGNMENT 8
diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c
index 5052be9261d9..0ae8d08d85a8 100644
--- a/fs/ksmbd/ndr.c
+++ b/fs/ksmbd/ndr.c
@@ -345,6 +345,8 @@ int ndr_encode_posix_acl(struct ndr *n,
{
unsigned int ref_id = 0x00020000;
int ret;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
n->offset = 0;
n->length = 1024;
@@ -372,10 +374,12 @@ int ndr_encode_posix_acl(struct ndr *n,
if (ret)
return ret;
- ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode)));
+ vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)));
if (ret)
return ret;
- ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode)));
+ vfsgid = i_gid_into_vfsgid(user_ns, inode);
+ ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)));
if (ret)
return ret;
ret = ndr_write_int32(n, inode->i_mode);
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 9046cff4374b..d7d47b82451d 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -1609,12 +1609,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
struct create_posix_rsp *buf;
struct inode *inode = file_inode(fp->filp);
struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode);
buf = (struct create_posix_rsp *)cc;
memset(buf, 0, sizeof(struct create_posix_rsp));
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct create_posix_rsp, nlink));
- buf->ccontext.DataLength = cpu_to_le32(52);
+ /*
+ * DataLength = nlink(4) + reparse_tag(4) + mode(4) +
+ * domain sid(28) + unix group sid(16).
+ */
+ buf->ccontext.DataLength = cpu_to_le32(56);
buf->ccontext.NameOffset = cpu_to_le16(offsetof
(struct create_posix_rsp, Name));
buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
@@ -1638,13 +1644,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
buf->nlink = cpu_to_le32(inode->i_nlink);
buf->reparse_tag = cpu_to_le32(fp->volatile_id);
- buf->mode = cpu_to_le32(inode->i_mode);
- id_to_sid(from_kuid_munged(&init_user_ns,
- i_uid_into_mnt(user_ns, inode)),
- SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]);
- id_to_sid(from_kgid_munged(&init_user_ns,
- i_gid_into_mnt(user_ns, inode)),
- SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]);
+ buf->mode = cpu_to_le32(inode->i_mode & 0777);
+ /*
+ * SidBuffer(44) contain two sids(Domain sid(28), UNIX group sid(16)).
+ * Domain sid(28) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 4(num_subauth)) + RID(4).
+ * UNIX group id(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
+ id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)),
+ SIDOWNER, (struct smb_sid *)&buf->SidBuffer[0]);
+ id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)),
+ SIDUNIX_GROUP, (struct smb_sid *)&buf->SidBuffer[28]);
}
/*
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index ce42bff42ef9..a0d635304754 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -235,10 +235,8 @@ send:
if (work->sess && work->sess->enc && work->encrypted &&
conn->ops->encrypt_resp) {
rc = conn->ops->encrypt_resp(work);
- if (rc < 0) {
+ if (rc < 0)
conn->ops->set_rsp_status(work, STATUS_DATA_ERROR);
- goto send;
- }
}
ksmbd_conn_write(work);
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 19412ac701a6..b2fc85d440d0 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -925,7 +925,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
*
* Return: true if connection should be encrypted, else false
*/
-static bool smb3_encryption_negotiated(struct ksmbd_conn *conn)
+bool smb3_encryption_negotiated(struct ksmbd_conn *conn)
{
if (!conn->ops->generate_encryptionkey)
return false;
@@ -1883,7 +1883,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
goto out_err1;
}
- name = ksmbd_extract_sharename(treename);
+ name = ksmbd_extract_sharename(conn->um, treename);
if (IS_ERR(name)) {
status.ret = KSMBD_TREE_CONN_STATUS_ERROR;
goto out_err1;
@@ -2185,7 +2185,7 @@ out:
* Return: 0 on success, otherwise error
*/
static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
- struct path *path)
+ const struct path *path)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
char *attr_name = NULL, *value;
@@ -2272,7 +2272,7 @@ next:
return rc;
}
-static noinline int smb2_set_stream_name_xattr(struct path *path,
+static noinline int smb2_set_stream_name_xattr(const struct path *path,
struct ksmbd_file *fp,
char *stream_name, int s_type)
{
@@ -2311,7 +2311,7 @@ static noinline int smb2_set_stream_name_xattr(struct path *path,
return 0;
}
-static int smb2_remove_smb_xattrs(struct path *path)
+static int smb2_remove_smb_xattrs(const struct path *path)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
char *name, *xattr_list = NULL;
@@ -2345,7 +2345,7 @@ out:
return err;
}
-static int smb2_create_truncate(struct path *path)
+static int smb2_create_truncate(const struct path *path)
{
int rc = vfs_truncate(path, 0);
@@ -2364,7 +2364,7 @@ static int smb2_create_truncate(struct path *path)
return rc;
}
-static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path,
+static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *path,
struct ksmbd_file *fp)
{
struct xattr_dos_attrib da = {0};
@@ -2387,7 +2387,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path,
}
static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
- struct path *path, struct ksmbd_file *fp)
+ const struct path *path, struct ksmbd_file *fp)
{
struct xattr_dos_attrib da;
int rc;
@@ -2447,7 +2447,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
static int smb2_create_sd_buffer(struct ksmbd_work *work,
struct smb2_create_req *req,
- struct path *path)
+ const struct path *path)
{
struct create_context *context;
struct create_sd_buf_req *sd_buf;
@@ -2477,8 +2477,11 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
struct user_namespace *mnt_userns,
struct inode *inode)
{
- fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode);
- fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
+ fattr->cf_uid = vfsuid_into_kuid(vfsuid);
+ fattr->cf_gid = vfsgid_into_kgid(vfsgid);
fattr->cf_mode = inode->i_mode;
fattr->cf_acls = NULL;
fattr->cf_dacls = NULL;
@@ -2761,7 +2764,6 @@ int smb2_open(struct ksmbd_work *work)
} else {
file_present = true;
user_ns = mnt_user_ns(path.mnt);
- generic_fillattr(user_ns, d_inode(path.dentry), &stat);
}
if (stream_name) {
if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
@@ -2770,7 +2772,8 @@ int smb2_open(struct ksmbd_work *work)
rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
}
} else {
- if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) {
+ if (file_present && S_ISDIR(d_inode(path.dentry)->i_mode) &&
+ s_type == DATA_STREAM) {
rc = -EIO;
rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
}
@@ -2787,7 +2790,8 @@ int smb2_open(struct ksmbd_work *work)
}
if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE &&
- S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
+ S_ISDIR(d_inode(path.dentry)->i_mode) &&
+ !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n",
name, req->CreateOptions);
rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
@@ -2797,7 +2801,7 @@ int smb2_open(struct ksmbd_work *work)
if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
!(req->CreateDisposition == FILE_CREATE_LE) &&
- !S_ISDIR(stat.mode)) {
+ !S_ISDIR(d_inode(path.dentry)->i_mode)) {
rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
rc = -EIO;
goto err_out;
@@ -3561,17 +3565,22 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9);
posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev);
posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink);
- posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode);
+ posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777);
posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino);
posix_info->DosAttributes =
S_ISDIR(ksmbd_kstat->kstat->mode) ?
FILE_ATTRIBUTE_DIRECTORY_LE : FILE_ATTRIBUTE_ARCHIVE_LE;
if (d_info->hide_dot_file && d_info->name[0] == '.')
posix_info->DosAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
+ /*
+ * SidBuffer(32) contain two sids(Domain sid(16), UNIX group sid(16)).
+ * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid),
- SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
+ SIDUNIX_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid),
- SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]);
+ SIDUNIX_GROUP, (struct smb_sid *)&posix_info->SidBuffer[16]);
memcpy(posix_info->name, conv_name, conv_len);
posix_info->name_len = cpu_to_le32(conv_len);
posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset);
@@ -3776,7 +3785,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info,
return 0;
}
-static int __query_dir(struct dir_context *ctx, const char *name, int namlen,
+static bool __query_dir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
@@ -3790,27 +3799,20 @@ static int __query_dir(struct dir_context *ctx, const char *name, int namlen,
/* dot and dotdot entries are already reserved */
if (!strcmp(".", name) || !strcmp("..", name))
- return 0;
+ return true;
if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name))
- return 0;
+ return true;
if (!match_pattern(name, namlen, priv->search_pattern))
- return 0;
+ return true;
d_info->name = name;
d_info->name_len = namlen;
rc = reserve_populate_dentry(d_info, priv->info_level);
if (rc)
- return rc;
- if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) {
+ return false;
+ if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY)
d_info->out_buf_len = 0;
- return 0;
- }
- return 0;
-}
-
-static void restart_ctx(struct dir_context *ctx)
-{
- ctx->pos = 0;
+ return true;
}
static int verify_info_level(int info_level)
@@ -3894,8 +3896,7 @@ int smb2_query_dir(struct ksmbd_work *work)
inode_permission(file_mnt_user_ns(dir_fp->filp),
file_inode(dir_fp->filp),
MAY_READ | MAY_EXEC)) {
- pr_err("no right to enumerate directory (%pd)\n",
- dir_fp->filp->f_path.dentry);
+ pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp);
rc = -EACCES;
goto err_out2;
}
@@ -3921,7 +3922,6 @@ int smb2_query_dir(struct ksmbd_work *work)
if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) {
ksmbd_debug(SMB, "Restart directory scan\n");
generic_file_llseek(dir_fp->filp, 0, SEEK_SET);
- restart_ctx(&dir_fp->readdir_data.ctx);
}
memset(&d_info, 0, sizeof(struct ksmbd_dir_info));
@@ -3968,11 +3968,9 @@ int smb2_query_dir(struct ksmbd_work *work)
*/
if (!d_info.out_buf_len && !d_info.num_entry)
goto no_buf_len;
- if (rc == 0)
- restart_ctx(&dir_fp->readdir_data.ctx);
- if (rc == -ENOSPC)
+ if (rc > 0 || rc == -ENOSPC)
rc = 0;
- if (rc)
+ else if (rc)
goto err_out;
d_info.wptr = d_info.rptr;
@@ -4029,6 +4027,8 @@ err_out2:
rsp->hdr.Status = STATUS_NO_MEMORY;
else if (rc == -EFAULT)
rsp->hdr.Status = STATUS_INVALID_INFO_CLASS;
+ else if (rc == -EIO)
+ rsp->hdr.Status = STATUS_FILE_CORRUPT_ERROR;
if (!rsp->hdr.Status)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
@@ -4158,7 +4158,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
int rc, name_len, value_len, xattr_list_len, idx;
ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0;
struct smb2_ea_info_req *ea_req = NULL;
- struct path *path;
+ const struct path *path;
struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
if (!(fp->daccess & FILE_READ_EA_LE)) {
@@ -4495,7 +4495,7 @@ static void get_file_stream_info(struct ksmbd_work *work,
struct smb2_file_stream_info *file_info;
char *stream_name, *xattr_list = NULL, *stream_buf;
struct kstat stat;
- struct path *path = &fp->filp->f_path;
+ const struct path *path = &fp->filp->f_path;
ssize_t xattr_list_len;
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
int buf_free_len;
@@ -4720,7 +4720,11 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
{
struct smb311_posix_qinfo *file_info;
struct inode *inode = file_inode(fp->filp);
+ struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode);
u64 time;
+ int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32;
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
@@ -4735,12 +4739,22 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info->EndOfFile = cpu_to_le64(inode->i_size);
file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
file_info->HardLinks = cpu_to_le32(inode->i_nlink);
- file_info->Mode = cpu_to_le32(inode->i_mode);
+ file_info->Mode = cpu_to_le32(inode->i_mode & 0777);
file_info->DeviceId = cpu_to_le32(inode->i_rdev);
- rsp->OutputBufferLength =
- cpu_to_le32(sizeof(struct smb311_posix_qinfo));
- inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo));
- return 0;
+
+ /*
+ * Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)).
+ * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
+ id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)),
+ SIDUNIX_USER, (struct smb_sid *)&file_info->Sids[0]);
+ id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)),
+ SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]);
+
+ rsp->OutputBufferLength = cpu_to_le32(out_buf_len);
+ inc_rfc1001_len(rsp_org, out_buf_len);
+ return out_buf_len;
}
static int smb2_get_info_file(struct ksmbd_work *work,
@@ -4860,8 +4874,8 @@ static int smb2_get_info_file(struct ksmbd_work *work,
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
rc = -EOPNOTSUPP;
} else {
- rc = find_file_posix_info(rsp, fp, work->response_buf);
- file_infoclass_size = sizeof(struct smb311_posix_qinfo);
+ file_infoclass_size = find_file_posix_info(rsp, fp,
+ work->response_buf);
}
break;
default:
@@ -5413,7 +5427,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (!pathname)
return -ENOMEM;
- abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX);
+ abs_oldname = file_path(fp->filp, pathname, PATH_MAX);
if (IS_ERR(abs_oldname)) {
rc = -EINVAL;
goto out;
@@ -5548,7 +5562,7 @@ static int smb2_create_link(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "link name is %s\n", link_name);
- target_name = d_path(&filp->f_path, pathname, PATH_MAX);
+ target_name = file_path(filp, pathname, PATH_MAX);
if (IS_ERR(target_name)) {
rc = -EINVAL;
goto out;
@@ -6266,8 +6280,8 @@ int smb2_read(struct ksmbd_work *work)
goto out;
}
- ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
- fp->filp->f_path.dentry, offset, length);
+ ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n",
+ fp->filp, offset, length);
work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
if (!work->aux_payload_buf) {
@@ -6531,8 +6545,8 @@ int smb2_write(struct ksmbd_work *work)
data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
le16_to_cpu(req->DataOffset));
- ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
- fp->filp->f_path.dentry, offset, length);
+ ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n",
+ fp->filp, offset, length);
err = ksmbd_vfs_write(work, fp, data_buf, length, &offset,
writethrough, &nbytes);
if (err < 0)
@@ -7643,11 +7657,16 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- if (in_buf_len < sizeof(struct validate_negotiate_info_req))
- return -EINVAL;
+ if (in_buf_len < offsetof(struct validate_negotiate_info_req,
+ Dialects)) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (out_buf_len < sizeof(struct validate_negotiate_info_rsp))
- return -EINVAL;
+ if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) {
+ ret = -EINVAL;
+ goto out;
+ }
ret = fsctl_validate_negotiate_info(conn,
(struct validate_negotiate_info_req *)&req->Buffer[0],
@@ -8573,7 +8592,7 @@ int smb3_encrypt_resp(struct ksmbd_work *work)
buf_size += iov[1].iov_len;
work->resp_hdr_sz = iov[1].iov_len;
- rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1);
+ rc = ksmbd_crypt_message(work, iov, rq_nvec, 1);
if (rc)
return rc;
@@ -8592,7 +8611,6 @@ bool smb3_is_transform_hdr(void *buf)
int smb3_decrypt_req(struct ksmbd_work *work)
{
- struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess;
char *buf = work->request_buf;
unsigned int pdu_length = get_rfc1002_len(buf);
@@ -8612,7 +8630,7 @@ int smb3_decrypt_req(struct ksmbd_work *work)
return -ECONNABORTED;
}
- sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId));
+ sess = ksmbd_session_lookup_all(work->conn, le64_to_cpu(tr_hdr->SessionId));
if (!sess) {
pr_err("invalid session id(%llx) in transform header\n",
le64_to_cpu(tr_hdr->SessionId));
@@ -8623,7 +8641,7 @@ int smb3_decrypt_req(struct ksmbd_work *work)
iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4;
iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr) + 4;
iov[1].iov_len = buf_data_size;
- rc = ksmbd_crypt_message(conn, iov, 2, 0);
+ rc = ksmbd_crypt_message(work, iov, 2, 0);
if (rc)
return rc;
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index af455278d005..092fdd3f8750 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -158,7 +158,8 @@ struct create_posix_rsp {
__le32 nlink;
__le32 reparse_tag;
__le32 mode;
- u8 SidBuffer[40];
+ /* SidBuffer contain two sids(Domain sid(28), UNIX group sid(16)) */
+ u8 SidBuffer[44];
} __packed;
struct smb2_buffer_desc_v1 {
@@ -439,7 +440,8 @@ struct smb2_posix_info {
__le32 HardLinks;
__le32 ReparseTag;
__le32 Mode;
- u8 SidBuffer[40];
+ /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */
+ u8 SidBuffer[32];
__le32 name_len;
u8 name[1];
/*
@@ -492,6 +494,7 @@ int smb3_decrypt_req(struct ksmbd_work *work);
int smb3_encrypt_resp(struct ksmbd_work *work);
bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work);
int smb2_set_rsp_credits(struct ksmbd_work *work);
+bool smb3_encryption_negotiated(struct ksmbd_conn *conn);
/* smb2 misc functions */
int ksmbd_smb2_check_message(struct ksmbd_work *work);
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index 7f8ab14fb8ec..d96da872d70a 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -4,6 +4,8 @@
* Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org>
*/
+#include <linux/user_namespace.h>
+
#include "smb_common.h"
#include "server.h"
#include "misc.h"
@@ -625,8 +627,8 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
if (!cred)
return -ENOMEM;
- cred->fsuid = make_kuid(current_user_ns(), uid);
- cred->fsgid = make_kgid(current_user_ns(), gid);
+ cred->fsuid = make_kuid(&init_user_ns, uid);
+ cred->fsgid = make_kgid(&init_user_ns, gid);
gi = groups_alloc(0);
if (!gi) {
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 3781bca2c8fc..b05ff9b146b5 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -275,7 +275,8 @@ static int sid_to_id(struct user_namespace *user_ns,
uid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id));
+ uid = KUIDT_INIT(id);
+ uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid));
if (uid_valid(uid)) {
fattr->cf_uid = uid;
rc = 0;
@@ -285,7 +286,8 @@ static int sid_to_id(struct user_namespace *user_ns,
gid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id));
+ gid = KGIDT_INIT(id);
+ gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid));
if (gid_valid(gid)) {
fattr->cf_gid = gid;
rc = 0;
@@ -991,7 +993,7 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type,
}
int smb_inherit_dacl(struct ksmbd_conn *conn,
- struct path *path,
+ const struct path *path,
unsigned int uid, unsigned int gid)
{
const struct smb_sid *psid, *creator = NULL;
@@ -1185,7 +1187,7 @@ bool smb_inherit_flags(int flags, bool is_dir)
return false;
}
-int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
@@ -1352,7 +1354,7 @@ err_out:
}
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
- struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
bool type_check)
{
int rc;
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
index fcb2c83f2992..618f2e0236b3 100644
--- a/fs/ksmbd/smbacl.h
+++ b/fs/ksmbd/smbacl.h
@@ -201,12 +201,12 @@ void posix_state_to_acl(struct posix_acl_state *state,
struct posix_acl_entry *pace);
int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid);
bool smb_inherit_flags(int flags, bool is_dir);
-int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_inherit_dacl(struct ksmbd_conn *conn, const struct path *path,
unsigned int uid, unsigned int gid);
-int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid);
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
- struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
bool type_check);
void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
void ksmbd_init_domain(u32 *sub_auth);
@@ -214,25 +214,25 @@ void ksmbd_init_domain(u32 *sub_auth);
static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns,
struct posix_acl_entry *pace)
{
- kuid_t kuid;
+ vfsuid_t vfsuid;
/* If this is an idmapped mount, apply the idmapping. */
- kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid);
+ vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid);
/* Translate the kuid into a userspace id ksmbd would see. */
- return from_kuid(&init_user_ns, kuid);
+ return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid));
}
static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns,
struct posix_acl_entry *pace)
{
- kgid_t kgid;
+ vfsgid_t vfsgid;
/* If this is an idmapped mount, apply the idmapping. */
- kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid);
+ vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid);
/* Translate the kgid into a userspace id ksmbd would see. */
- return from_kgid(&init_user_ns, kgid);
+ return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid));
}
#endif /* _SMBACL_H */
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 35b55ee94fe5..096eda9ef873 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -32,7 +32,7 @@
/* SMB_DIRECT negotiation timeout in seconds */
#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120
-#define SMB_DIRECT_MAX_SEND_SGES 8
+#define SMB_DIRECT_MAX_SEND_SGES 6
#define SMB_DIRECT_MAX_RECV_SGES 1
/*
@@ -62,13 +62,13 @@ static int smb_direct_receive_credit_max = 255;
static int smb_direct_send_credit_target = 255;
/* The maximum single message size can be sent to remote peer */
-static int smb_direct_max_send_size = 8192;
+static int smb_direct_max_send_size = 1364;
/* The maximum fragmented upper-layer payload receive size supported */
static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
-static int smb_direct_max_receive_size = 8192;
+static int smb_direct_max_receive_size = 1364;
static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;
@@ -1527,6 +1527,8 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
}
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED: {
+ ib_drain_qp(t->qp);
+
t->status = SMB_DIRECT_CS_DISCONNECTED;
wake_up_interruptible(&t->wait_status);
wake_up_interruptible(&t->wait_reassembly_queue);
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 143bba4e4db8..63d55f543bd2 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -399,7 +399,8 @@ static int create_socket(struct interface *iface)
ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
if (ret) {
- pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret);
+ if (ret != -EAFNOSUPPORT)
+ pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret);
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP,
&ksmbd_socket);
if (ret) {
diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h
index 5593024230ae..076f6034a789 100644
--- a/fs/ksmbd/unicode.h
+++ b/fs/ksmbd/unicode.h
@@ -24,6 +24,7 @@
#include <asm/byteorder.h>
#include <linux/types.h>
#include <linux/nls.h>
+#include <linux/unicode.h>
#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
@@ -69,7 +70,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen,
const struct nls_table *codepage);
int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapchars);
-char *ksmbd_extract_sharename(char *treename);
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename);
#endif
/*
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 78d01033604c..8de970d6146f 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -377,8 +377,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
if (work->conn->connection_type) {
if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
- pr_err("no right to read(%pd)\n",
- fp->filp->f_path.dentry);
+ pr_err("no right to read(%pD)\n", fp->filp);
return -EACCES;
}
}
@@ -487,8 +486,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
if (work->conn->connection_type) {
if (!(fp->daccess & FILE_WRITE_DATA_LE)) {
- pr_err("no right to write(%pd)\n",
- fp->filp->f_path.dentry);
+ pr_err("no right to write(%pD)\n", fp->filp);
err = -EACCES;
goto out;
}
@@ -527,8 +525,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
if (sync) {
err = vfs_fsync_range(filp, offset, offset + *written, 0);
if (err < 0)
- pr_err("fsync failed for filename = %pd, err = %d\n",
- fp->filp->f_path.dentry, err);
+ pr_err("fsync failed for filename = %pD, err = %d\n",
+ fp->filp, err);
}
out:
@@ -543,7 +541,7 @@ out:
*
* Return: 0 on success, otherwise error
*/
-int ksmbd_vfs_getattr(struct path *path, struct kstat *stat)
+int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
{
int err;
@@ -1105,7 +1103,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns,
return err;
}
-static int __dir_empty(struct dir_context *ctx, const char *name, int namlen,
+static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
@@ -1113,9 +1111,7 @@ static int __dir_empty(struct dir_context *ctx, const char *name, int namlen,
buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
buf->dirent_count++;
- if (buf->dirent_count > 2)
- return -ENOTEMPTY;
- return 0;
+ return buf->dirent_count <= 2;
}
/**
@@ -1142,22 +1138,33 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp)
return err;
}
-static int __caseless_lookup(struct dir_context *ctx, const char *name,
+static bool __caseless_lookup(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
+ int cmp = -EINVAL;
buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
if (buf->used != namlen)
- return 0;
- if (!strncasecmp((char *)buf->private, name, namlen)) {
+ return true;
+ if (IS_ENABLED(CONFIG_UNICODE) && buf->um) {
+ const struct qstr q_buf = {.name = buf->private,
+ .len = buf->used};
+ const struct qstr q_name = {.name = name,
+ .len = namlen};
+
+ cmp = utf8_strncasecmp(buf->um, &q_buf, &q_name);
+ }
+ if (cmp < 0)
+ cmp = strncasecmp((char *)buf->private, name, namlen);
+ if (!cmp) {
memcpy((char *)buf->private, name, namlen);
buf->dirent_count = 1;
- return -EEXIST;
+ return false;
}
- return 0;
+ return true;
}
/**
@@ -1168,7 +1175,8 @@ static int __caseless_lookup(struct dir_context *ctx, const char *name,
*
* Return: 0 on success, otherwise error
*/
-static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
+static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
+ size_t namelen, struct unicode_map *um)
{
int ret;
struct file *dfilp;
@@ -1178,6 +1186,7 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
.private = name,
.used = namelen,
.dirent_count = 0,
+ .um = um,
};
dfilp = dentry_open(dir, flags, current_cred());
@@ -1240,7 +1249,8 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
break;
err = ksmbd_vfs_lookup_in_dir(&parent, filename,
- filename_len);
+ filename_len,
+ work->conn->um);
path_put(&parent);
if (err)
goto out;
@@ -1743,11 +1753,11 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
*total_size_written = 0;
if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
- pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry);
+ pr_err("no right to read(%pD)\n", src_fp->filp);
return -EACCES;
}
if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
- pr_err("no right to write(%pd)\n", dst_fp->filp->f_path.dentry);
+ pr_err("no right to write(%pD)\n", dst_fp->filp);
return -EACCES;
}
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 70da4c0ba7ad..593059ca8511 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -12,6 +12,7 @@
#include <linux/namei.h>
#include <uapi/linux/xattr.h>
#include <linux/posix_acl.h>
+#include <linux/unicode.h>
#include "smbacl.h"
#include "xattr.h"
@@ -60,6 +61,7 @@ struct ksmbd_readdir_data {
unsigned int used;
unsigned int dirent_count;
unsigned int file_attr;
+ struct unicode_map *um;
};
/* ksmbd kstat wrapper to get valid create time when reading dir entry */
@@ -85,7 +87,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id);
int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name);
int ksmbd_vfs_link(struct ksmbd_work *work,
const char *oldname, const char *newname);
-int ksmbd_vfs_getattr(struct path *path, struct kstat *stat);
+int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat);
int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
char *newname);
int ksmbd_vfs_truncate(struct ksmbd_work *work,
diff --git a/fs/libfs.c b/fs/libfs.c
index 31b0ddf01c31..682d56345a1c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -15,6 +15,7 @@
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
+#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
@@ -1520,3 +1521,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
#endif
}
EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+
+/**
+ * inode_maybe_inc_iversion - increments i_version
+ * @inode: inode with the i_version that should be updated
+ * @force: increment the counter even if it's not necessary?
+ *
+ * Every time the inode is modified, the i_version field must be seen to have
+ * changed by any observer.
+ *
+ * If "force" is set or the QUERIED flag is set, then ensure that we increment
+ * the value, and clear the queried flag.
+ *
+ * In the common case where neither is set, then we can return "false" without
+ * updating i_version.
+ *
+ * If this function returns false, and no other metadata has changed, then we
+ * can avoid logging the metadata.
+ */
+bool inode_maybe_inc_iversion(struct inode *inode, bool force)
+{
+ u64 cur, new;
+
+ /*
+ * The i_version field is not strictly ordered with any other inode
+ * information, but the legacy inode_inc_iversion code used a spinlock
+ * to serialize increments.
+ *
+ * Here, we add full memory barriers to ensure that any de-facto
+ * ordering with other info is preserved.
+ *
+ * This barrier pairs with the barrier in inode_query_iversion()
+ */
+ smp_mb();
+ cur = inode_peek_iversion_raw(inode);
+ do {
+ /* If flag is clear then we needn't do anything */
+ if (!force && !(cur & I_VERSION_QUERIED))
+ return false;
+
+ /* Since lowest bit is flag, add 2 to avoid it */
+ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
+ } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
+ return true;
+}
+EXPORT_SYMBOL(inode_maybe_inc_iversion);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 47ccfcbe0a22..e272ad738faf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,8 +90,14 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
return -ENOMEM;
INIT_LIST_HEAD(&entry->e_list);
- /* Initial hash reference */
- atomic_set(&entry->e_refcnt, 1);
+ /*
+ * We create entry with two references. One reference is kept by the
+ * hash table, the other reference is used to protect us from
+ * mb_cache_entry_delete_or_get() until the entry is fully setup. This
+ * avoids nesting of cache->c_list_lock into hash table bit locks which
+ * is problematic for RT.
+ */
+ atomic_set(&entry->e_refcnt, 2);
entry->e_key = key;
entry->e_value = value;
entry->e_reusable = reusable;
@@ -106,15 +112,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
}
}
hlist_bl_add_head(&entry->e_hash_list, head);
- /*
- * Add entry to LRU list before it can be found by
- * mb_cache_entry_delete() to avoid races
- */
+ hlist_bl_unlock(head);
spin_lock(&cache->c_list_lock);
list_add_tail(&entry->e_list, &cache->c_list);
cache->c_entry_count++;
spin_unlock(&cache->c_list_lock);
- hlist_bl_unlock(head);
+ mb_cache_entry_put(cache, entry);
return 0;
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 937fa5fae2b8..8afdc408ca4f 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -53,16 +53,16 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir,
}
static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
int error;
struct inode *inode = minix_new_inode(dir, mode, &error);
if (inode) {
minix_set_inode(inode, 0);
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
}
- return error;
+ return finish_open_simple(file, error);
}
static int minix_create(struct user_namespace *mnt_userns, struct inode *dir,
diff --git a/fs/namei.c b/fs/namei.c
index 53b4bc094db2..578c2110df02 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -986,7 +986,7 @@ static int nd_jump_root(struct nameidata *nd)
* Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
-int nd_jump_link(struct path *path)
+int nd_jump_link(const struct path *path)
{
int error = -ELOOP;
struct nameidata *nd = current->nameidata;
@@ -1178,7 +1178,7 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns,
*
* Returns 0 if successful, -ve on error.
*/
-int may_linkat(struct user_namespace *mnt_userns, struct path *link)
+int may_linkat(struct user_namespace *mnt_userns, const struct path *link)
{
struct inode *inode = link->dentry->d_inode;
@@ -3583,72 +3583,94 @@ static int do_open(struct nameidata *nd,
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
-struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
- struct dentry *dentry, umode_t mode, int open_flag)
+static int vfs_tmpfile(struct user_namespace *mnt_userns,
+ const struct path *parentpath,
+ struct file *file, umode_t mode)
{
- struct dentry *child = NULL;
- struct inode *dir = dentry->d_inode;
+ struct dentry *child;
+ struct inode *dir = d_inode(parentpath->dentry);
struct inode *inode;
int error;
/* we want directory to be writable */
error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
- goto out_err;
- error = -EOPNOTSUPP;
+ return error;
if (!dir->i_op->tmpfile)
- goto out_err;
- error = -ENOMEM;
- child = d_alloc(dentry, &slash_name);
+ return -EOPNOTSUPP;
+ child = d_alloc(parentpath->dentry, &slash_name);
if (unlikely(!child))
- goto out_err;
+ return -ENOMEM;
+ file->f_path.mnt = parentpath->mnt;
+ file->f_path.dentry = child;
mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode);
- error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
+ error = dir->i_op->tmpfile(mnt_userns, dir, file, mode);
+ dput(child);
if (error)
- goto out_err;
- error = -ENOENT;
- inode = child->d_inode;
- if (unlikely(!inode))
- goto out_err;
- if (!(open_flag & O_EXCL)) {
+ return error;
+ /* Don't check for other permissions, the inode was just created */
+ error = may_open(mnt_userns, &file->f_path, 0, file->f_flags);
+ if (error)
+ return error;
+ inode = file_inode(file);
+ if (!(file->f_flags & O_EXCL)) {
spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
ima_post_create_tmpfile(mnt_userns, inode);
- return child;
+ return 0;
+}
-out_err:
- dput(child);
- return ERR_PTR(error);
+/**
+ * vfs_tmpfile_open - open a tmpfile for kernel internal use
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @parentpath: path of the base directory
+ * @mode: mode of the new tmpfile
+ * @open_flag: flags
+ * @cred: credentials for open
+ *
+ * Create and open a temporary file. The file is not accounted in nr_files,
+ * hence this is only for kernel internal use, and must not be installed into
+ * file tables or such.
+ */
+struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns,
+ const struct path *parentpath,
+ umode_t mode, int open_flag, const struct cred *cred)
+{
+ struct file *file;
+ int error;
+
+ file = alloc_empty_file_noaccount(open_flag, cred);
+ if (!IS_ERR(file)) {
+ error = vfs_tmpfile(mnt_userns, parentpath, file, mode);
+ if (error) {
+ fput(file);
+ file = ERR_PTR(error);
+ }
+ }
+ return file;
}
-EXPORT_SYMBOL(vfs_tmpfile);
+EXPORT_SYMBOL(vfs_tmpfile_open);
static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
struct file *file)
{
struct user_namespace *mnt_userns;
- struct dentry *child;
struct path path;
int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
+
if (unlikely(error))
return error;
error = mnt_want_write(path.mnt);
if (unlikely(error))
goto out;
mnt_userns = mnt_user_ns(path.mnt);
- child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
- error = PTR_ERR(child);
- if (IS_ERR(child))
+ error = vfs_tmpfile(mnt_userns, &path, file, op->mode);
+ if (error)
goto out2;
- dput(path.dentry);
- path.dentry = child;
- audit_inode(nd->name, child, 0);
- /* Don't check for other permissions, the inode was just created */
- error = may_open(mnt_userns, &path, 0, op->open_flag);
- if (!error)
- error = vfs_open(&path, file);
+ audit_inode(nd->name, file->f_path.dentry, 0);
out2:
mnt_drop_write(path.mnt);
out:
@@ -5088,7 +5110,7 @@ int page_symlink(struct inode *inode, const char *symname, int len)
const struct address_space_operations *aops = mapping->a_ops;
bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
unsigned int flags;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d6c2ddc7ea6..58036f657126 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2022,7 +2022,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
err = finish_open(file, dentry, do_open);
if (err)
goto out;
- if (S_ISREG(file->f_path.dentry->d_inode->i_mode))
+ if (S_ISREG(file_inode(file)->i_mode))
nfs_file_set_open_context(file, ctx);
else
err = -EOPENSTALE;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e032fe201a36..d8ec889a4b3f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -567,7 +567,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
}
wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
lock_page(page);
mapping = page_file_mapping(page);
@@ -655,9 +656,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
if (mntflags & NFS_MOUNT_WRITE_WAIT) {
- result = filemap_fdatawait_range(file->f_mapping,
- iocb->ki_pos - written,
- iocb->ki_pos - 1);
+ filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
}
result = generic_write_sync(iocb, written);
if (result < 0)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7d285561e59f..1ec79ccf89ad 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -30,14 +30,20 @@
#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
#define FF_LAYOUTRETURN_MAXERR 20
+enum nfs4_ff_op_type {
+ NFS4_FF_OP_LAYOUTSTATS,
+ NFS4_FF_OP_LAYOUTRETURN,
+};
+
static unsigned short io_maxretrans;
static const struct pnfs_commit_ops ff_layout_commit_ops;
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr);
-static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+static int
+ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
struct nfs42_layoutstat_devinfo *devinfo,
- int dev_limit);
+ int dev_limit, enum nfs4_ff_op_type type);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
struct nfs4_ff_layout_mirror *mirror);
@@ -1373,6 +1379,11 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
return -EIO;
}
+ if (!pnfs_is_valid_lseg(hdr->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1553,6 +1564,11 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EIO;
}
+ if (!pnfs_is_valid_lseg(hdr->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1645,15 +1661,23 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
}
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
- struct nfs_commit_data *cdata)
+static int ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
{
+ if (!pnfs_is_valid_lseg(cdata->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
ff_layout_commit_record_layoutstats_start(task, cdata);
+ return 0;
}
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
- ff_layout_commit_prepare_common(task, data);
+ if (ff_layout_commit_prepare_common(task, data))
+ return;
+
rpc_call_start(task);
}
@@ -1949,6 +1973,65 @@ ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
ff_layout_initiate_commit);
}
+static bool ff_layout_match_rw(const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr,
+ const struct pnfs_layout_segment *lseg)
+{
+ return hdr->lseg == lseg;
+}
+
+static bool ff_layout_match_commit(const struct rpc_task *task,
+ const struct nfs_commit_data *cdata,
+ const struct pnfs_layout_segment *lseg)
+{
+ return cdata->lseg == lseg;
+}
+
+static bool ff_layout_match_io(const struct rpc_task *task, const void *data)
+{
+ const struct rpc_call_ops *ops = task->tk_ops;
+
+ if (ops == &ff_layout_read_call_ops_v3 ||
+ ops == &ff_layout_read_call_ops_v4 ||
+ ops == &ff_layout_write_call_ops_v3 ||
+ ops == &ff_layout_write_call_ops_v4)
+ return ff_layout_match_rw(task, task->tk_calldata, data);
+ if (ops == &ff_layout_commit_call_ops_v3 ||
+ ops == &ff_layout_commit_call_ops_v4)
+ return ff_layout_match_commit(task, task->tk_calldata, data);
+ return false;
+}
+
+static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_ff_layout_ds *mirror_ds;
+ struct nfs4_pnfs_ds *ds;
+ struct nfs_client *ds_clp;
+ struct rpc_clnt *clnt;
+ u32 idx;
+
+ for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
+ mirror = flseg->mirror_array[idx];
+ mirror_ds = mirror->mirror_ds;
+ if (!mirror_ds)
+ continue;
+ ds = mirror->mirror_ds->ds;
+ if (!ds)
+ continue;
+ ds_clp = ds->ds_clp;
+ if (!ds_clp)
+ continue;
+ clnt = ds_clp->cl_rpcclient;
+ if (!clnt)
+ continue;
+ if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg))
+ continue;
+ rpc_clnt_disconnect(clnt);
+ }
+}
+
static struct pnfs_ds_commit_info *
ff_layout_get_ds_info(struct inode *inode)
{
@@ -2161,8 +2244,9 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
FF_LAYOUTRETURN_MAXERR);
spin_lock(&args->inode->i_lock);
- ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
- &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo));
+ ff_args->num_dev = ff_layout_mirror_prepare_stats(
+ &ff_layout->generic_hdr, &ff_args->devinfo[0],
+ ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN);
spin_unlock(&args->inode->i_lock);
args->ld_private->ops = &layoutreturn_ops;
@@ -2396,7 +2480,7 @@ static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
static int
ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
struct nfs42_layoutstat_devinfo *devinfo,
- int dev_limit)
+ int dev_limit, enum nfs4_ff_op_type type)
{
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
@@ -2408,7 +2492,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
break;
if (IS_ERR_OR_NULL(mirror->mirror_ds))
continue;
- if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
+ &mirror->flags) &&
+ type != NFS4_FF_OP_LAYOUTRETURN)
continue;
/* mirror refcount put in cleanup_layoutstats */
if (!refcount_inc_not_zero(&mirror->ref))
@@ -2448,7 +2534,9 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
spin_lock(&args->inode->i_lock);
ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
- &args->devinfo[0], dev_count);
+ &args->devinfo[0],
+ dev_count,
+ NFS4_FF_OP_LAYOUTSTATS);
spin_unlock(&args->inode->i_lock);
if (!args->num_dev) {
kfree(args->devinfo);
@@ -2501,6 +2589,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.prepare_layoutreturn = ff_layout_prepare_layoutreturn,
.sync = pnfs_nfs_generic_sync,
.prepare_layoutstats = ff_layout_prepare_layoutstats,
+ .cancel_io = ff_layout_cancel_io,
};
static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bea7c005119c..6b2cfa59a1a2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
return nfs_fileid_to_ino_t(fattr->fileid);
}
-static int nfs_wait_killable(int mode)
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
}
-
-int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
-{
- return nfs_wait_killable(mode);
-}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
/**
@@ -318,7 +313,7 @@ struct nfs_find_desc {
static int
nfs_find_actor(struct inode *inode, void *opaque)
{
- struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque;
+ struct nfs_find_desc *desc = opaque;
struct nfs_fh *fh = desc->fh;
struct nfs_fattr *fattr = desc->fattr;
@@ -336,7 +331,7 @@ nfs_find_actor(struct inode *inode, void *opaque)
static int
nfs_init_locked(struct inode *inode, void *opaque)
{
- struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque;
+ struct nfs_find_desc *desc = opaque;
struct nfs_fattr *fattr = desc->fattr;
set_nfs_fileid(inode, fattr->fileid);
@@ -1332,7 +1327,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
*/
for (;;) {
ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (ret)
goto out;
spin_lock(&inode->i_lock);
@@ -2271,7 +2267,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
static void init_once(void *foo)
{
- struct nfs_inode *nfsi = (struct nfs_inode *) foo;
+ struct nfs_inode *nfsi = foo;
inode_init_once(&nfsi->vfs_inode);
INIT_LIST_HEAD(&nfsi->open_files);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 898dd95bc7a7..d914d609b85b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -435,7 +435,6 @@ extern void nfs_zap_acl_cache(struct inode *inode);
extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
-extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);
/* super.c */
extern const struct super_operations nfs_sops;
@@ -503,7 +502,6 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
const struct nfs_pgio_completion_ops *compl_ops);
extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
extern void nfs_commit_free(struct nfs_commit_data *p);
-extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
extern int nfs_initiate_commit(struct rpc_clnt *clnt,
struct nfs_commit_data *data,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1597eef40d54..2e7579626cf0 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
res = rpc_call_sync(clnt, msg, flags);
if (res != -EJUKEBOX)
break;
- freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));
return res;
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6dab9e408372..13424f0d793b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -341,7 +341,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
}
- status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+ status = nfs_filemap_write_and_wait_range(src->f_mapping,
pos_src, pos_src + (loff_t)count - 1);
if (status)
return status;
@@ -1175,6 +1175,7 @@ static int _nfs42_proc_removexattr(struct inode *inode, const char *name)
ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
&res.seq_res, 1);
+ trace_nfs4_removexattr(inode, name, ret);
if (!ret)
nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
@@ -1214,6 +1215,7 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name,
ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
&res.seq_res, 1);
+ trace_nfs4_setxattr(inode, name, ret);
for (; np > 0; np--)
put_page(pages[np - 1]);
@@ -1246,6 +1248,7 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
&res.seq_res, 0);
+ trace_nfs4_getxattr(inode, name, ret);
if (ret < 0)
return ret;
@@ -1317,6 +1320,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
&res.seq_res, 0);
+ trace_nfs4_listxattr(inode, ret);
if (ret >= 0) {
ret = res.copied;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index a9bf09fdf2c3..76ae11834206 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -981,7 +981,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
static void nfs4_xattr_cache_init_once(void *p)
{
- struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p;
+ struct nfs4_xattr_cache *cache = p;
spin_lock_init(&cache->listxattr_lock);
atomic_long_set(&cache->nent, 0);
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index b56f05113d36..fe1aeb0f048f 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -569,6 +569,14 @@ static int decode_listxattrs(struct xdr_stream *xdr,
*/
if (status == -ETOOSMALL)
status = -ERANGE;
+ /*
+ * Special case: for LISTXATTRS, NFS4ERR_NOXATTR
+ * should be translated to success with zero-length reply.
+ */
+ if (status == -ENODATA) {
+ res->eof = true;
+ status = 0;
+ }
goto out;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 79df6e83881b..400a71e75238 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -459,7 +459,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);
/* nfs4renewd.c */
extern void nfs4_schedule_state_renewal(struct nfs_client *);
-extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
extern void nfs4_kill_renewd(struct nfs_client *);
extern void nfs4_renew_state(struct work_struct *);
extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3c5678aec006..7a5162afa5c0 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -254,7 +254,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
goto error;
ip_addr = (const char *)buf;
}
- strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+ strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
err = nfs_idmap_new(clp);
if (err < 0) {
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index ec6afd3c4bca..e3fdd2f45b01 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -583,7 +583,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
struct request_key_auth *rka = get_request_key_auth(authkey);
struct rpc_pipe_msg *msg;
struct idmap_msg *im;
- struct idmap *idmap = (struct idmap *)aux;
+ struct idmap *idmap = aux;
struct key *key = rka->target_key;
int ret = -ENOKEY;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3ed14a2a84a4..e2efcd26336c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -416,8 +416,8 @@ static int nfs4_delay_killable(long *timeout)
{
might_sleep();
- freezable_schedule_timeout_killable_unsafe(
- nfs4_update_delay(timeout));
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
if (!__fatal_signal_pending(current))
return 0;
return -EINTR;
@@ -427,7 +427,8 @@ static int nfs4_delay_interruptible(long *timeout)
{
might_sleep();
- freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
if (!signal_pending(current))
return 0;
return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS;
@@ -6607,7 +6608,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
struct nfs4_delegreturndata *d_data;
struct pnfs_layout_hdr *lo;
- d_data = (struct nfs4_delegreturndata *)data;
+ d_data = data;
if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) {
nfs4_sequence_done(task, &d_data->res.seq_res);
@@ -7406,7 +7407,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
status = nfs4_proc_setlk(state, cmd, request);
if ((status != -EAGAIN) || IS_SETLK(cmd))
break;
- freezable_schedule_timeout_interruptible(timeout);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(timeout);
timeout *= 2;
timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
status = -ERESTARTSYS;
@@ -7474,10 +7476,8 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
break;
status = -ERESTARTSYS;
- freezer_do_not_count();
- wait_woken(&waiter.wait, TASK_INTERRUPTIBLE,
+ wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
NFS4_LOCK_MAXTIMEOUT);
- freezer_count();
} while (!signalled());
remove_wait_queue(q, &waiter.wait);
@@ -8900,7 +8900,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred)
void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
void *data)
{
- struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+ struct nfs4_add_xprt_data *adata = data;
struct rpc_task *task;
int status;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9bab3e9c702a..c3503fb26fa2 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -497,8 +497,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
sp = kzalloc(sizeof(*sp), gfp_flags);
if (!sp)
return NULL;
- sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0,
- gfp_flags);
+ sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags);
if (sp->so_seqid.owner_id < 0) {
kfree(sp);
return NULL;
@@ -534,7 +533,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
{
nfs4_destroy_seqid_counter(&sp->so_seqid);
put_cred(sp->so_cred);
- ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
+ ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
kfree(sp);
}
@@ -877,8 +876,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
lsp->ls_owner = fl_owner;
- lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id,
- 0, 0, GFP_KERNEL_ACCOUNT);
+ lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
@@ -890,7 +888,7 @@ out_free:
void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
- ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id);
+ ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id);
nfs4_destroy_seqid_counter(&lsp->ls_seqid);
kfree(lsp);
}
@@ -1314,7 +1312,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
refcount_inc(&clp->cl_count);
res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (res)
goto out;
if (clp->cl_cons_state < 0)
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 6ee6ad3674a2..2cff5901c689 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -2097,6 +2097,7 @@ TRACE_EVENT(ff_layout_commit_error,
)
);
+#ifdef CONFIG_NFS_V4_2
TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA);
TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
@@ -2105,7 +2106,6 @@ TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
{ NFS4_CONTENT_DATA, "DATA" }, \
{ NFS4_CONTENT_HOLE, "HOLE" })
-#ifdef CONFIG_NFS_V4_2
TRACE_EVENT(nfs4_llseek,
TP_PROTO(
const struct inode *inode,
@@ -2496,6 +2496,54 @@ TRACE_EVENT(nfs4_offload_cancel,
__entry->stateid_seq, __entry->stateid_hash
)
);
+
+DECLARE_EVENT_CLASS(nfs4_xattr_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const char *name,
+ int error
+ ),
+
+ TP_ARGS(inode, name, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(name, name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __assign_str(name, name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "name=%s",
+ -__entry->error, show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __get_str(name)
+ )
+);
+#define DEFINE_NFS4_XATTR_EVENT(name) \
+ DEFINE_EVENT(nfs4_xattr_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const char *name, \
+ int error \
+ ), \
+ TP_ARGS(inode, name, error))
+DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr);
+DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr);
+DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr);
+
+DEFINE_NFS4_INODE_EVENT(nfs4_listxattr);
#endif /* CONFIG_NFS_V4_2 */
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index fa148308822c..620329b7e6ae 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -139,7 +139,7 @@ static int __init nfs_root_setup(char *line)
ROOT_DEV = Root_NFS;
if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
- strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
+ strscpy(nfs_root_parms, line, sizeof(nfs_root_parms));
} else {
size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
if (n >= sizeof(nfs_root_parms))
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2613b7e36eb9..a5db5158c634 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -710,6 +710,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
+ struct nfs_server *server = NFS_SERVER(lo->plh_inode);
int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -722,8 +723,10 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_seq,
lseg->pls_range.offset, lseg->pls_range.length);
- if (!mark_lseg_invalid(lseg, tmp_list))
- remaining++;
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ remaining++;
+ pnfs_lseg_cancel_io(server, lseg);
}
dprintk("%s:Return %i\n", __func__, remaining);
return remaining;
@@ -1908,7 +1911,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
pnfs_layoutcommit_inode(lo->plh_inode, false);
return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
nfs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
}
static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
@@ -2485,6 +2488,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
+ struct nfs_server *server = NFS_SERVER(lo->plh_inode);
int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -2507,6 +2511,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ pnfs_lseg_cancel_io(server, lseg);
}
if (remaining) {
@@ -3192,7 +3197,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
status = wait_on_bit_lock_action(&nfsi->flags,
NFS_INO_LAYOUTCOMMITTING,
nfs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (status)
goto out;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f331f067691b..e3e6a41f19de 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -169,6 +169,8 @@ struct pnfs_layoutdriver_type {
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+
+ void (*cancel_io)(struct pnfs_layout_segment *lseg);
};
struct pnfs_commit_ops {
@@ -685,6 +687,13 @@ pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page
req_offset(req), req_last);
}
+static inline void pnfs_lseg_cancel_io(struct nfs_server *server,
+ struct pnfs_layout_segment *lseg)
+{
+ if (server->pnfs_curr_ld->cancel_io)
+ server->pnfs_curr_ld->cancel_io(lseg);
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 657c242a18ff..987c88ddeaf0 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -374,12 +374,12 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
return NULL;
}
-/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
* for @page
* @cinfo - commit info for current inode
* @page - page to search for matching head request
*
- * Returns a the head request if one is found, otherwise returns NULL.
+ * Return: the head request if one is found, otherwise %NULL.
*/
struct nfs_page *
pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index d5c57360b418..29a62db155fb 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -405,22 +405,15 @@ nfsd_file_unhash(struct nfsd_file *nf)
return false;
}
-/*
- * Return true if the file was unhashed.
- */
-static bool
+static void
nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
{
trace_nfsd_file_unhash_and_dispose(nf);
- if (!nfsd_file_unhash(nf))
- return false;
- /* keep final reference for nfsd_file_lru_dispose */
- if (refcount_dec_not_one(&nf->nf_ref))
- return true;
-
- nfsd_file_lru_remove(nf);
- list_add(&nf->nf_lru, dispose);
- return true;
+ if (nfsd_file_unhash(nf)) {
+ /* caller must call nfsd_file_dispose_list() later */
+ nfsd_file_lru_remove(nf);
+ list_add(&nf->nf_lru, dispose);
+ }
}
static void
@@ -562,8 +555,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
* @lock: LRU list lock (unused)
* @arg: dispose list
*
- * Note this can deadlock with nfsd_file_cache_purge.
- *
* Return values:
* %LRU_REMOVED: @item was removed from the LRU
* %LRU_ROTATE: @item is to be moved to the LRU tail
@@ -748,8 +739,6 @@ nfsd_file_close_inode(struct inode *inode)
*
* Walk the LRU list and close any entries that have not been used since
* the last scan.
- *
- * Note this can deadlock with nfsd_file_cache_purge.
*/
static void
nfsd_file_delayed_close(struct work_struct *work)
@@ -891,16 +880,12 @@ out_err:
goto out;
}
-/*
- * Note this can deadlock with nfsd_file_lru_cb.
- */
static void
__nfsd_file_cache_purge(struct net *net)
{
struct rhashtable_iter iter;
struct nfsd_file *nf;
LIST_HEAD(dispose);
- bool del;
rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
do {
@@ -910,14 +895,7 @@ __nfsd_file_cache_purge(struct net *net)
while (!IS_ERR_OR_NULL(nf)) {
if (net && nf->nf_net != net)
continue;
- del = nfsd_file_unhash_and_dispose(nf, &dispose);
-
- /*
- * Deadlock detected! Something marked this entry as
- * unhased, but hasn't removed it from the hash list.
- */
- WARN_ON_ONCE(!del);
-
+ nfsd_file_unhash_and_dispose(nf, &dispose);
nf = rhashtable_walk_next(&iter);
}
@@ -1064,9 +1042,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
.need = may_flags & NFSD_FILE_MAY_MASK,
.net = SVC_NET(rqstp),
};
- struct nfsd_file *nf, *new;
- bool retry = true;
+ bool open_retry = true;
+ struct nfsd_file *nf;
__be32 status;
+ int ret;
status = fh_verify(rqstp, fhp, S_IFREG,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
@@ -1076,35 +1055,33 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
key.cred = get_current_cred();
retry:
- /* Avoid allocation if the item is already in cache */
- nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
- nfsd_file_rhash_params);
+ rcu_read_lock();
+ nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params);
if (nf)
nf = nfsd_file_get(nf);
+ rcu_read_unlock();
if (nf)
goto wait_for_construction;
- new = nfsd_file_alloc(&key, may_flags);
- if (!new) {
+ nf = nfsd_file_alloc(&key, may_flags);
+ if (!nf) {
status = nfserr_jukebox;
goto out_status;
}
- nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl,
- &key, &new->nf_rhash,
- nfsd_file_rhash_params);
- if (!nf) {
- nf = new;
- goto open_file;
- }
- if (IS_ERR(nf))
- goto insert_err;
- nf = nfsd_file_get(nf);
- if (nf == NULL) {
- nf = new;
+ ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl,
+ &key, &nf->nf_rhash,
+ nfsd_file_rhash_params);
+ if (likely(ret == 0))
goto open_file;
- }
- nfsd_file_slab_free(&new->nf_rcu);
+
+ nfsd_file_slab_free(&nf->nf_rcu);
+ if (ret == -EEXIST)
+ goto retry;
+ trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret);
+ status = nfserr_jukebox;
+ goto out_status;
wait_for_construction:
wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
@@ -1112,11 +1089,11 @@ wait_for_construction:
/* Did construction of this file fail? */
if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf);
- if (!retry) {
+ if (!open_retry) {
status = nfserr_jukebox;
goto out;
}
- retry = false;
+ open_retry = false;
nfsd_file_put_noref(nf);
goto retry;
}
@@ -1164,13 +1141,6 @@ open_file:
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
goto out;
-
-insert_err:
- nfsd_file_slab_free(&new->nf_rcu);
- trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf));
- nf = NULL;
- status = nfserr_jukebox;
- goto out_status;
}
/**
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5d680045fa2c..78b8cd9651d5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -266,7 +266,7 @@ struct nfs4_dir_ctx {
struct list_head names;
};
-static int
+static bool
nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -275,14 +275,14 @@ nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
struct name_list *entry;
if (namlen != HEXDIR_LEN - 1)
- return 0;
+ return true;
entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
if (entry == NULL)
- return -ENOMEM;
+ return false;
memcpy(entry->name, name, HEXDIR_LEN - 1);
entry->name[HEXDIR_LEN - 1] = '\0';
list_add(&entry->list, &ctx->names);
- return 0;
+ return true;
}
static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 198d7abf34e4..4e718500a00c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4375,8 +4375,8 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
nn->nfsd4_grace = 90;
nn->somebody_reclaimed = false;
nn->track_reclaim_completes = false;
- nn->clverifier_counter = prandom_u32();
- nn->clientid_base = prandom_u32();
+ nn->clverifier_counter = get_random_u32();
+ nn->clientid_base = get_random_u32();
nn->clientid_counter = nn->clientid_base + 1;
nn->s2s_cp_cl_id = nn->clientid_counter++;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6a29bcfc9390..dc74a947a440 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1458,12 +1458,14 @@ static __net_init int nfsd_init_net(struct net *net)
goto out_drc_error;
retval = nfsd_reply_cache_init(nn);
if (retval)
- goto out_drc_error;
+ goto out_cache_error;
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
return 0;
+out_cache_error:
+ nfsd4_leases_net_shutdown(nn);
out_drc_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index d73434200df9..8c52b6c9d31a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -392,8 +392,8 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
skip_pseudoflavor_check:
/* Finally, check access permissions. */
error = nfsd_permission(rqstp, exp, dentry, access);
- trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
out:
+ trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
if (error == nfserr_stale)
nfsd_stats_fh_stale_inc(exp);
return error;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 83be89905cbf..f650afedd67f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1854,7 +1854,7 @@ struct readdir_data {
int full;
};
-static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -1866,7 +1866,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
if (buf->used + reclen > PAGE_SIZE) {
buf->full = 1;
- return -EINVAL;
+ return false;
}
de->namlen = namlen;
@@ -1876,7 +1876,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
memcpy(de->name, name, namlen);
buf->used += reclen;
- return 0;
+ return true;
}
static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 9f4d9432d38a..b9d15c3df3cc 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
nextmaxkey = (nchildren > 1) ?
nilfs_btree_node_get_key(node, nchildren - 2) : 0;
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
}
@@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
ptrs[i] = le64_to_cpu(dptrs[i]);
}
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
return nitems;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 67f63cfeade5..232dd7b6cca1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -328,6 +328,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
struct inode *inode;
struct nilfs_inode_info *ii;
struct nilfs_root *root;
+ struct buffer_head *bh;
int err = -ENOMEM;
ino_t ino;
@@ -343,11 +344,25 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
ii->i_state = BIT(NILFS_I_NEW);
ii->i_root = root;
- err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
if (unlikely(err))
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
+ if (unlikely(ino < NILFS_USER_INO)) {
+ nilfs_warn(sb,
+ "inode bitmap is inconsistent for reserved inodes");
+ do {
+ brelse(bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
+ if (unlikely(err))
+ goto failed_ifile_create_inode;
+ } while (ino < NILFS_USER_INO);
+
+ nilfs_info(sb, "repaired inode bitmap for reserved inodes");
+ }
+ ii->i_bh = bh;
+
atomic64_inc(&root->inodes_count);
inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_ino = ino;
@@ -440,6 +455,8 @@ int nilfs_read_inode_common(struct inode *inode,
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
+ return -EIO; /* this inode is for metadata and corrupted */
if (inode->i_nlink == 0)
return -ESTALE; /* this inode is deleted */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3267e96c256c..39b7eea2642a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff)
{
- unsigned int i;
+ unsigned int i, nr_folios;
pgoff_t index;
- unsigned int nblocks_in_page;
unsigned long length = 0;
- sector_t b;
- struct pagevec pvec;
- struct page *page;
+ struct folio_batch fbatch;
+ struct folio *folio;
if (inode->i_mapping->nrpages == 0)
return 0;
index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
- nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
repeat:
- pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
- pvec.pages);
- if (pvec.nr == 0)
+ nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX,
+ &fbatch);
+ if (nr_folios == 0)
return length;
- if (length > 0 && pvec.pages[0]->index > index)
- goto out;
-
- b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits);
i = 0;
do {
- page = pvec.pages[i];
+ folio = fbatch.folios[i];
- lock_page(page);
- if (page_has_buffers(page)) {
+ folio_lock(folio);
+ if (folio_buffers(folio)) {
struct buffer_head *bh, *head;
+ sector_t b;
- bh = head = page_buffers(page);
+ b = folio->index << (PAGE_SHIFT - inode->i_blkbits);
+ bh = head = folio_buffers(folio);
do {
if (b < start_blk)
continue;
@@ -529,21 +524,17 @@ repeat:
} else {
if (length > 0)
goto out_locked;
-
- b += nblocks_in_page;
}
- unlock_page(page);
+ folio_unlock(folio);
- } while (++i < pagevec_count(&pvec));
+ } while (++i < nr_folios);
- index = page->index + 1;
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
goto repeat;
out_locked:
- unlock_page(page);
-out:
- pagevec_release(&pvec);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
return length;
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 0afe0832c754..b4cebad21b48 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -875,9 +875,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- } else
- WARN_ON(err == -EINVAL || err == -ENOENT);
-
+ } else if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint creation failed due to metadata corruption.");
+ err = -EIO;
+ }
return err;
}
@@ -891,7 +893,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
&raw_cp, &bh_cp);
if (unlikely(err)) {
- WARN_ON(err == -EINVAL || err == -ENOENT);
+ if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint finalization failed due to metadata corruption.");
+ err = -EIO;
+ }
goto failed_ibh;
}
raw_cp->cp_snapshot_list.ssl_next = 0;
@@ -2235,7 +2241,6 @@ int nilfs_construct_segment(struct super_block *sb)
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_sc_info *sci = nilfs->ns_writer;
struct nilfs_transaction_info *ti;
- int err;
if (!sci)
return -EROFS;
@@ -2243,8 +2248,7 @@ int nilfs_construct_segment(struct super_block *sb)
/* A call inside transactions causes a deadlock. */
BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
- err = nilfs_segctor_sync(sci);
- return err;
+ return nilfs_segctor_sync(sci);
}
/**
@@ -2786,10 +2790,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
err = nilfs_segctor_start_thread(nilfs->ns_writer);
- if (err) {
- kfree(nilfs->ns_writer);
- nilfs->ns_writer = NULL;
- }
+ if (unlikely(err))
+ nilfs_detach_log_writer(sb);
+
return err;
}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index cd7d09a569ff..a2a15bc4df28 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -18,7 +18,7 @@
#include "fanotify.h"
-static bool fanotify_path_equal(struct path *p1, struct path *p2)
+static bool fanotify_path_equal(const struct path *p1, const struct path *p2)
{
return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 1d9f11255c64..57f51a9a3015 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -452,13 +452,7 @@ static inline bool fanotify_is_error_event(u32 mask)
return mask & FAN_FS_ERROR;
}
-static inline bool fanotify_event_has_path(struct fanotify_event *event)
-{
- return event->type == FANOTIFY_EVENT_TYPE_PATH ||
- event->type == FANOTIFY_EVENT_TYPE_PATH_PERM;
-}
-
-static inline struct path *fanotify_event_path(struct fanotify_event *event)
+static inline const struct path *fanotify_event_path(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_PATH)
return &FANOTIFY_PE(event)->path;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f0e49a406ffa..4546da4a54f9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -249,7 +249,7 @@ out:
return event;
}
-static int create_fd(struct fsnotify_group *group, struct path *path,
+static int create_fd(struct fsnotify_group *group, const struct path *path,
struct file **file)
{
int client_fd;
@@ -619,7 +619,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
char __user *buf, size_t count)
{
struct fanotify_event_metadata metadata;
- struct path *path = fanotify_event_path(event);
+ const struct path *path = fanotify_event_path(event);
struct fanotify_info *info = fanotify_event_info(event);
unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
@@ -1553,7 +1553,7 @@ static int fanotify_test_fid(struct dentry *dentry)
}
static int fanotify_events_supported(struct fsnotify_group *group,
- struct path *path, __u64 mask,
+ const struct path *path, __u64 mask,
unsigned int flags)
{
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 87d8a50ee803..fde74eb333cc 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -76,10 +76,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
*/
extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
-/* allocate and destroy and event holder to attach events to notification/access queues */
-extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
-extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
-
extern struct kmem_cache *fsnotify_mark_connector_cachep;
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 800c1d0eb0d0..3506f6074288 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -28,7 +28,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
- return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+ return dynamic_dname(buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
}
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 52615e6090e1..a3865bc4a0c6 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -594,17 +594,37 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
u8 *mrec_end = (u8 *)ctx->mrec +
le32_to_cpu(ctx->mrec->bytes_allocated);
- u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
- a->name_length * sizeof(ntfschar);
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
- name_end > mrec_end)
+ u8 *name_end;
+
+ /* check whether ATTR_RECORD wrap */
+ if ((u8 *)a < (u8 *)ctx->mrec)
+ break;
+
+ /* check whether Attribute Record Header is within bounds */
+ if ((u8 *)a > mrec_end ||
+ (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
+ break;
+
+ /* check whether ATTR_RECORD's name is within bounds */
+ name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if (name_end > mrec_end)
break;
+
ctx->attr = a;
if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
a->type == AT_END))
return -ENOENT;
if (unlikely(!a->length))
break;
+
+ /* check whether ATTR_RECORD's length wrap */
+ if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
+ break;
+ /* check whether ATTR_RECORD's length is within bounds */
+ if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
+ break;
+
if (a->type != type)
continue;
/*
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 58b660dbbee9..c481b14e4fd9 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -527,12 +527,12 @@ err_out:
goto out;
}
-static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
+static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
{
lock_buffer(bh);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- return submit_bh(REQ_OP_READ, bh);
+ submit_bh(REQ_OP_READ, bh);
}
/**
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index db0f1995aedd..08c659332e26 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi)
goto err_out;
}
+ /* Sanity check offset to the first attribute */
+ if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
+ ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
+ le16_to_cpu(m->attrs_offset));
+ goto err_out;
+ }
+
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 5d44ceac855b..e92bbd754365 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -560,7 +560,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
buf = (ulong *)bh->b_data;
- used = __bitmap_weight(buf, wbits);
+ used = bitmap_weight(buf, wbits);
if (used < wbits) {
frb = wbits - used;
wnd->free_bits[iw] = frb;
@@ -1364,7 +1364,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
buf = (ulong *)bh->b_data;
__bitmap_clear(buf, b0, blocksize * 8 - b0);
- frb = wbits - __bitmap_weight(buf, wbits);
+ frb = wbits - bitmap_weight(buf, wbits);
wnd->total_zeroes += frb - wnd->free_bits[iw];
wnd->free_bits[iw] = frb;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index e7c494005122..0d611a6c5511 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -3819,7 +3819,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
}
log_init_pg_hdr(log, page_size, page_size, 1, 1);
- log_create(log, l_size, 0, get_random_int(), false, false);
+ log_create(log, l_size, 0, get_random_u32(), false, false);
log->ra = ra;
@@ -3893,7 +3893,7 @@ check_restart_area:
/* Do some checks based on whether we have a valid log page. */
if (!rst_info.valid_page) {
- open_log_count = get_random_int();
+ open_log_count = get_random_u32();
goto init_log_instance;
}
open_log_count = le32_to_cpu(ra2->open_log_count);
@@ -4044,7 +4044,7 @@ find_oldest:
memcpy(ra->clients, Add2Ptr(ra2, t16),
le16_to_cpu(ra2->ra_len) - t16);
- log->current_openlog_count = get_random_int();
+ log->current_openlog_count = get_random_u32();
ra->open_log_count = cpu_to_le32(log->current_openlog_count);
log->ra_size = offsetof(struct RESTART_AREA, clients) +
sizeof(struct CLIENT_REC);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 26a76ebfe58f..d5a3afbbbfd8 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -630,12 +630,9 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
set_bh_page(bh, page, off);
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
+ err = bh_read(bh, 0);
+ if (err < 0)
goto out;
- }
zero_user_segment(page, off + voff, off + block_size);
}
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index af4157f61927..1d65f6ef00ca 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -636,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
!buffer_new(bh) &&
ocfs2_should_read_blk(inode, page, block_start) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
*wait_bh++=bh;
}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 81c3d65d68fe..694471fc46b8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2032,7 +2032,7 @@ struct ocfs2_empty_dir_priv {
unsigned seen_other;
unsigned dx_dir;
};
-static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
+static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
int name_len, loff_t pos, u64 ino,
unsigned type)
{
@@ -2052,7 +2052,7 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
*/
if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
p->seen_dot = 1;
- return 0;
+ return true;
}
if (name_len == 2 && !strncmp("..", name, 2) &&
@@ -2060,13 +2060,13 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
p->seen_dot_dot = 1;
if (p->dx_dir && p->seen_dot)
- return 1;
+ return false;
- return 0;
+ return true;
}
p->seen_other = 1;
- return 1;
+ return false;
}
static int ocfs2_empty_dir_dx(struct inode *inode,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index fa87d89cf754..126671e6caed 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2057,7 +2057,7 @@ struct ocfs2_orphan_filldir_priv {
enum ocfs2_orphan_reco_type orphan_reco_type;
};
-static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
+static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
int name_len, loff_t pos, u64 ino,
unsigned type)
{
@@ -2066,21 +2066,21 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
struct inode *iter;
if (name_len == 1 && !strncmp(".", name, 1))
- return 0;
+ return true;
if (name_len == 2 && !strncmp("..", name, 2))
- return 0;
+ return true;
/* do not include dio entry in case of orphan scan */
if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
(!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
OCFS2_DIO_ORPHAN_PREFIX_LEN)))
- return 0;
+ return true;
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
- return 0;
+ return true;
if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
OCFS2_DIO_ORPHAN_PREFIX_LEN))
@@ -2090,7 +2090,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
* happen concurrently with unlink/rename */
if (OCFS2_I(iter)->ip_next_orphan) {
iput(iter);
- return 0;
+ return true;
}
trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
@@ -2099,7 +2099,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
OCFS2_I(iter)->ip_next_orphan = p->head;
p->head = iter;
- return 0;
+ return true;
}
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 961d1cf54388..05f32989bad6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -232,6 +232,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
handle_t *handle = NULL;
struct ocfs2_super *osb;
struct ocfs2_dinode *dirfe;
+ struct ocfs2_dinode *fe = NULL;
struct buffer_head *new_fe_bh = NULL;
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
@@ -382,6 +383,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
goto leave;
}
+ fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
if (S_ISDIR(mode)) {
status = ocfs2_fill_new_dir(osb, handle, dir, inode,
new_fe_bh, data_ac, meta_ac);
@@ -454,8 +456,11 @@ roll_back:
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -632,18 +637,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
return status;
}
- status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
- if (status < 0) {
- u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
- int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
- inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
- if (tmp)
- mlog_errno(tmp);
- }
-
- return status;
}
static int ocfs2_mkdir(struct user_namespace *mnt_userns,
@@ -2028,8 +2024,11 @@ bail:
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 638d875eccc7..7aebdbf5cc0a 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -527,7 +527,7 @@ struct ocfs2_extent_block
* value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
*/
struct ocfs2_slot_map {
-/*00*/ __le16 sm_slots[0];
+/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots);
/*
* Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
* 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
@@ -548,7 +548,7 @@ struct ocfs2_extended_slot {
* i_size.
*/
struct ocfs2_slot_map_extended {
-/*00*/ struct ocfs2_extended_slot se_slots[0];
+/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots);
/*
* Actual size is i_size of the slot_map system file. It should
* match s_max_slots * sizeof(struct ocfs2_extended_slot)
@@ -727,7 +727,7 @@ struct ocfs2_dinode {
struct ocfs2_extent_list i_list;
struct ocfs2_truncate_log i_dealloc;
struct ocfs2_inline_data i_data;
- __u8 i_symlink[0];
+ DECLARE_FLEX_ARRAY(__u8, i_symlink);
} id2;
/* Actual on-disk size is one block */
};
@@ -892,7 +892,7 @@ struct ocfs2_group_desc
/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
__le64 bg_reserved2;
/*40*/ union {
- __u8 bg_bitmap[0];
+ DECLARE_FLEX_ARRAY(__u8, bg_bitmap);
struct {
/*
* Block groups may be discontiguous when
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 1358981e80a3..623db358b1ef 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
}
/*
- * Calculate out the start and number of virtual clusters we need to to CoW.
+ * Calculate out the start and number of virtual clusters we need to CoW.
*
* cpos is vitual start cluster position we want to do CoW in a
* file and write_len is the cluster length.
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index dd77b7aaabf5..317126261523 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
- strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
+ strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
if (cluster_name_len)
- strlcpy(new_conn->cc_cluster_name, cluster_name,
+ strscpy(new_conn->cc_cluster_name, cluster_name,
CLUSTER_NAME_MAX + 1);
new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 5805a03d100b..9c74eace3adc 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle,
u32 *cluster_start,
u32 *num_clusters);
/*
- * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * Use this variant of ocfs2_claim_clusters to specify a maximum
* number of clusters smaller than the allocation reserved.
*/
int __ocfs2_claim_clusters(handle_t *handle,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e2cc9eec287c..42c993e53924 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1764,9 +1764,7 @@ static int ocfs2_get_sector(struct super_block *sb,
if (!buffer_dirty(*bh))
clear_buffer_uptodate(*bh);
unlock_buffer(*bh);
- ll_rw_block(REQ_OP_READ, 1, bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
+ if (bh_read(*bh, 0) < 0) {
mlog_errno(-EIO);
brelse(*bh);
*bh = NULL;
@@ -2221,7 +2219,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto out_journal;
}
- strlcpy(osb->vol_label, di->id2.i_super.s_label,
+ strscpy(osb->vol_label, di->id2.i_super.s_label,
OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
diff --git a/fs/open.c b/fs/open.c
index cf7e5c350a54..a81319b6177f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -842,7 +842,9 @@ static int do_dentry_open(struct file *f,
return 0;
}
- if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+ if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_inc(inode);
+ } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
@@ -882,8 +884,6 @@ static int do_dentry_open(struct file *f,
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
- if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
@@ -937,10 +937,7 @@ cleanup_all:
if (WARN_ON_ONCE(error > 0))
error = -EINVAL;
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(f->f_path.mnt);
- }
+ put_file_access(f);
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index e2c2699d8016..9cacce5d55c1 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -398,7 +398,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file)
const struct file_operations orangefs_dir_operations = {
.llseek = orangefs_dir_llseek,
.read = generic_read_dir,
- .iterate = orangefs_dir_iterate,
+ .iterate_shared = orangefs_dir_iterate,
.open = orangefs_dir_open,
.release = orangefs_dir_release
};
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 86810e5d7914..732661aa2680 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -417,9 +417,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
* readahead cache (if any); this forces an expensive refresh of
* data for the next caller of mmap (or 'get_block' accesses)
*/
- if (file_inode(file) &&
- file_inode(file)->i_mapping &&
- mapping_nrpages(&file_inode(file)->i_data)) {
+ if (mapping_nrpages(file->f_mapping)) {
if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
gossip_debug(GOSSIP_INODE_DEBUG,
"calling flush_racache on %pU\n",
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index fdde6c56cc3d..f436d8847f08 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,7 +44,7 @@ static bool ovl_must_copy_xattr(const char *name)
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
}
-int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *new)
+int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new)
{
struct dentry *old = oldpath->dentry;
ssize_t list_size, size, value_size = 0;
@@ -132,8 +132,8 @@ out:
return error;
}
-static int ovl_copy_fileattr(struct inode *inode, struct path *old,
- struct path *new)
+static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
+ const struct path *new)
{
struct fileattr oldfa = { .flags_valid = true };
struct fileattr newfa = { .flags_valid = true };
@@ -193,11 +193,11 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
return ovl_real_fileattr_set(new, &newfa);
}
-static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
- struct path *new, loff_t len)
+static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
+ struct file *new_file, loff_t len)
{
+ struct path datapath;
struct file *old_file;
- struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
loff_t cloned;
@@ -206,23 +206,18 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
bool skip_hole = false;
int error = 0;
- if (len == 0)
- return 0;
+ ovl_path_lowerdata(dentry, &datapath);
+ if (WARN_ON(datapath.dentry == NULL))
+ return -EIO;
- old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
+ old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
- new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
- if (IS_ERR(new_file)) {
- error = PTR_ERR(new_file);
- goto out_fput;
- }
-
/* Try to use clone_file_range to clone up within the same fs */
cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
if (cloned == len)
- goto out;
+ goto out_fput;
/* Couldn't clone, so now we try to copy the data */
/* Check if lower fs supports seek operation */
@@ -282,10 +277,8 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
len -= bytes;
}
-out:
if (!error && ovl_should_sync(ofs))
error = vfs_fsync(new_file, 0);
- fput(new_file);
out_fput:
fput(old_file);
return error;
@@ -556,30 +549,31 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
return err;
}
-static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
+static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp)
{
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
- struct inode *inode = d_inode(c->dentry);
- struct path upperpath, datapath;
+ struct file *new_file;
int err;
- ovl_path_upper(c->dentry, &upperpath);
- if (WARN_ON(upperpath.dentry != NULL))
- return -EIO;
+ if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size)
+ return 0;
- upperpath.dentry = temp;
+ new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY);
+ if (IS_ERR(new_file))
+ return PTR_ERR(new_file);
- /*
- * Copy up data first and then xattrs. Writing data after
- * xattrs will remove security.capability xattr automatically.
- */
- if (S_ISREG(c->stat.mode) && !c->metacopy) {
- ovl_path_lowerdata(c->dentry, &datapath);
- err = ovl_copy_up_data(ofs, &datapath, &upperpath,
- c->stat.size);
- if (err)
- return err;
- }
+ err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size);
+ fput(new_file);
+
+ return err;
+}
+
+static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
+{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct inode *inode = d_inode(c->dentry);
+ struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp };
+ int err;
err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp);
if (err)
@@ -662,6 +656,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *inode;
struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
+ struct path path = { .mnt = ovl_upper_mnt(ofs) };
struct dentry *temp, *upper;
struct ovl_cu_creds cc;
int err;
@@ -688,7 +683,16 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
if (IS_ERR(temp))
goto unlock;
- err = ovl_copy_up_inode(c, temp);
+ /*
+ * Copy up data first and then xattrs. Writing data after
+ * xattrs will remove security.capability xattr automatically.
+ */
+ path.dentry = temp;
+ err = ovl_copy_up_data(c, &path);
+ if (err)
+ goto cleanup;
+
+ err = ovl_copy_up_metadata(c, temp);
if (err)
goto cleanup;
@@ -732,6 +736,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(c->destdir);
struct dentry *temp, *upper;
+ struct file *tmpfile;
struct ovl_cu_creds cc;
int err;
@@ -739,15 +744,22 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
return err;
- temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+ tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
ovl_revert_cu_creds(&cc);
- if (IS_ERR(temp))
- return PTR_ERR(temp);
+ if (IS_ERR(tmpfile))
+ return PTR_ERR(tmpfile);
- err = ovl_copy_up_inode(c, temp);
+ temp = tmpfile->f_path.dentry;
+ if (!c->metacopy && c->stat.size) {
+ err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size);
+ if (err)
+ return err;
+ }
+
+ err = ovl_copy_up_metadata(c, temp);
if (err)
- goto out_dput;
+ goto out_fput;
inode_lock_nested(udir, I_MUTEX_PARENT);
@@ -761,16 +773,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
inode_unlock(udir);
if (err)
- goto out_dput;
+ goto out_fput;
if (!c->metacopy)
ovl_set_upperdata(d_inode(c->dentry));
- ovl_inode_update(d_inode(c->dentry), temp);
+ ovl_inode_update(d_inode(c->dentry), dget(temp));
- return 0;
-
-out_dput:
- dput(temp);
+out_fput:
+ fput(tmpfile);
return err;
}
@@ -872,7 +882,7 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
return true;
}
-static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value)
+static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **value)
{
ssize_t res;
char *buf;
@@ -899,7 +909,7 @@ static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value)
static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
{
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
- struct path upperpath, datapath;
+ struct path upperpath;
int err;
char *capability = NULL;
ssize_t cap_size;
@@ -908,10 +918,6 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
if (WARN_ON(upperpath.dentry == NULL))
return -EIO;
- ovl_path_lowerdata(c->dentry, &datapath);
- if (WARN_ON(datapath.dentry == NULL))
- return -EIO;
-
if (c->stat.size) {
err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS,
&capability);
@@ -919,7 +925,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
goto out;
}
- err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size);
+ err = ovl_copy_up_data(c, &upperpath);
if (err)
goto out_free;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index daff601b5c41..a1a22f58ba18 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -38,7 +38,7 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
static struct file *ovl_open_realfile(const struct file *file,
- struct path *realpath)
+ const struct path *realpath)
{
struct inode *realinode = d_inode(realpath->dentry);
struct inode *inode = file_inode(file);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 0fbcb590af84..9e61511de7a7 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -588,7 +588,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Introducing security_inode_fileattr_get/set() hooks would solve this issue
* properly.
*/
-static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
+static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
bool set)
{
struct file *file;
@@ -610,7 +610,7 @@ static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
return err;
}
-int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
{
int err;
@@ -685,7 +685,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
}
}
-int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
{
int err;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 69dc577974f8..0fd1d5fdfc72 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -26,7 +26,7 @@ struct ovl_lookup_data {
bool metacopy;
};
-static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d,
+static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d,
size_t prelen, const char *post)
{
int res;
@@ -194,7 +194,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, struct path *path)
+static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
{
return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index ee93c825b06b..eee8f08d32b6 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -208,7 +208,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs,
return err;
}
-static inline ssize_t ovl_do_getxattr(struct path *path, const char *name,
+static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name,
void *value, size_t size)
{
int err, len;
@@ -238,7 +238,7 @@ static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs,
}
static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs,
- struct path *path,
+ const struct path *path,
enum ovl_xattr ox, void *value,
size_t size)
{
@@ -310,14 +310,16 @@ static inline int ovl_do_whiteout(struct ovl_fs *ofs,
return err;
}
-static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs,
- struct dentry *dentry, umode_t mode)
+static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
+ struct dentry *dentry, umode_t mode)
{
- struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0);
- int err = PTR_ERR_OR_ZERO(ret);
+ struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
+ struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode,
+ O_LARGEFILE | O_WRONLY, current_cred());
+ int err = PTR_ERR_OR_ZERO(file);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
- return ret;
+ return file;
}
static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
@@ -401,13 +403,13 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_dentry_version_get(struct dentry *dentry);
bool ovl_is_whiteout(struct dentry *dentry);
-struct file *ovl_path_open(struct path *path, int flags);
+struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path,
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox);
-bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path);
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
struct dentry *upperdentry)
@@ -430,9 +432,9 @@ bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path);
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding);
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
int ovl_sync_status(struct ovl_fs *ofs);
static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
@@ -556,7 +558,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
struct list_head *list);
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
-int ovl_check_d_type_supported(struct path *realpath);
+int ovl_check_d_type_supported(const struct path *realpath);
int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
struct vfsmount *mnt, struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
@@ -673,8 +675,8 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
extern const struct file_operations ovl_file_operations;
int __init ovl_aio_request_cache_init(void);
void ovl_aio_request_cache_destroy(void);
-int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa);
-int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int ovl_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
@@ -683,7 +685,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns,
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
-int ovl_copy_xattr(struct super_block *sb, struct path *path, struct dentry *new);
+int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 78f62cc1797b..2b210640036c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -170,7 +170,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
return p;
}
-static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
@@ -179,22 +179,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
struct ovl_cache_entry *p;
if (ovl_cache_entry_find_link(name, len, &newp, &parent))
- return 0;
+ return true;
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return -ENOMEM;
+ return false;
}
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, rdd->root);
- return 0;
+ return true;
}
-static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
+static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -211,7 +211,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
list_add_tail(&p->l_node, &rdd->middle);
}
- return rdd->err;
+ return rdd->err == 0;
}
void ovl_cache_free(struct list_head *list)
@@ -250,7 +250,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
}
}
-static int ovl_fill_merge(struct dir_context *ctx, const char *name,
+static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -264,7 +264,7 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
-static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd)
+static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
@@ -291,7 +291,7 @@ static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd)
return err;
}
-static inline int ovl_dir_read(struct path *realpath,
+static inline int ovl_dir_read(const struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
@@ -455,7 +455,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
* copy up origin, call vfs_getattr() on the overlay entry to make
* sure that d_ino will be consistent with st_ino from stat(2).
*/
-static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
+static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p)
{
struct dentry *dir = path->dentry;
@@ -528,7 +528,7 @@ fail:
goto out;
}
-static int ovl_fill_plain(struct dir_context *ctx, const char *name,
+static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -540,14 +540,14 @@ static int ovl_fill_plain(struct dir_context *ctx, const char *name,
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return -ENOMEM;
+ return false;
}
list_add_tail(&p->l_node, rdd->list);
- return 0;
+ return true;
}
-static int ovl_dir_read_impure(struct path *path, struct list_head *list,
+static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct rb_root *root)
{
int err;
@@ -592,7 +592,7 @@ static int ovl_dir_read_impure(struct path *path, struct list_head *list,
return 0;
}
-static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
+static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
{
int res;
struct dentry *dentry = path->dentry;
@@ -648,7 +648,7 @@ struct ovl_readdir_translate {
bool xinowarn;
};
-static int ovl_fill_real(struct dir_context *ctx, const char *name,
+static bool ovl_fill_real(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -834,7 +834,7 @@ out_unlock:
}
static struct file *ovl_dir_open_realfile(const struct file *file,
- struct path *realpath)
+ const struct path *realpath)
{
struct file *res;
const struct cred *old_cred;
@@ -1027,7 +1027,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
inode_unlock(upper->d_inode);
}
-static int ovl_check_d_type(struct dir_context *ctx, const char *name,
+static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -1036,19 +1036,19 @@ static int ovl_check_d_type(struct dir_context *ctx, const char *name,
/* Even if d_type is not supported, DT_DIR is returned for . and .. */
if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
- return 0;
+ return true;
if (d_type != DT_UNKNOWN)
rdd->d_type_supported = true;
- return 0;
+ return true;
}
/*
* Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
* if error is encountered.
*/
-int ovl_check_d_type_supported(struct path *realpath)
+int ovl_check_d_type_supported(const struct path *realpath)
{
int err;
struct ovl_readdir_data rdd = {
@@ -1065,7 +1065,7 @@ int ovl_check_d_type_supported(struct path *realpath)
#define OVL_INCOMPATDIR_NAME "incompat"
-static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, struct path *path,
+static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path,
int level)
{
int err;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 5da771b218d1..a29a8afe9b26 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/posix_acl_xattr.h>
#include <linux/exportfs.h>
+#include <linux/file.h>
#include "overlayfs.h"
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
@@ -908,7 +909,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
return err;
}
-static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
+static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
const char *name)
{
struct kstatfs statfs;
@@ -1366,10 +1367,11 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
}
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
- struct path *workpath)
+ const struct path *workpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
- struct dentry *temp, *workdir;
+ struct dentry *workdir;
+ struct file *tmpfile;
bool rename_whiteout;
bool d_type;
int fh_type;
@@ -1405,10 +1407,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
- temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
- ofs->tmpfile = !IS_ERR(temp);
+ tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
+ ofs->tmpfile = !IS_ERR(tmpfile);
if (ofs->tmpfile)
- dput(temp);
+ fput(tmpfile);
else
pr_warn("upper fs does not support tmpfile.\n");
@@ -1495,7 +1497,7 @@ out:
}
static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
- struct path *upperpath)
+ const struct path *upperpath)
{
int err;
struct path workpath = { };
@@ -1538,7 +1540,7 @@ out:
}
static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
- struct ovl_entry *oe, struct path *upperpath)
+ struct ovl_entry *oe, const struct path *upperpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *indexdir;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 87f811c089e4..81a57a8d80d9 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -490,7 +490,7 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
-struct file *ovl_path_open(struct path *path, int flags)
+struct file *ovl_path_open(const struct path *path, int flags)
{
struct inode *inode = d_inode(path->dentry);
struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt);
@@ -578,7 +578,7 @@ void ovl_copy_up_end(struct dentry *dentry)
ovl_inode_unlock(d_inode(dentry));
}
-bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path)
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
{
int res;
@@ -591,7 +591,7 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path)
return false;
}
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path,
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox)
{
int res;
@@ -971,7 +971,7 @@ err:
}
/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path)
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path)
{
int res;
@@ -1015,7 +1015,7 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding)
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding)
{
int res;
char *s, *next, *buf = NULL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 74ae9fafd25a..42c7ff41c2db 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -860,7 +860,7 @@ static struct vfsmount *pipe_mnt __read_mostly;
*/
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ return dynamic_dname(buffer, buflen, "pipe:[%lu]",
d_inode(dentry)->i_ino);
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index b4f109875e79..74dc0f571dc9 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -24,6 +24,7 @@
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/mnt_idmapping.h>
+#include <linux/iversion.h>
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
@@ -1227,6 +1228,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
}
inode->i_ctime = current_time(inode);
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
set_cached_acl(inode, type, acl);
return 0;
}
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index c930001056f9..32b1116ae137 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR
config PROC_CHILDREN
bool "Include /proc/<pid>/task/<tid>/children file"
+ depends on PROC_FS
default n
help
Provides a fast way to retrieve first level children pids of a task. See
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 99fcbfda8e25..49283b8103c7 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -279,7 +279,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
+ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f7e3d971e4..9e479d7d202b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1761,7 +1761,7 @@ out:
return ERR_PTR(error);
}
-static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
@@ -2350,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ struct vma_iterator vmi;
genradix_init(&fa);
@@ -2388,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
* routine might require mmap_lock taken in might_fault().
*/
- for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ pos = 2;
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (++pos <= ctx->pos)
@@ -2728,7 +2731,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
return -ESRCH;
length = security_getprocattr(task, PROC_I(inode)->op.lsm,
- (char*)file->f_path.dentry->d_name.name,
+ file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
if (length > 0)
@@ -3196,6 +3199,19 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *
return 0;
}
+static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ mmput(mm);
+ }
+
+ return 0;
+}
#endif /* CONFIG_KSM */
#ifdef CONFIG_STACKLEAK_METRICS
@@ -3331,6 +3347,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#endif
#ifdef CONFIG_KSM
ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
#endif
};
@@ -3668,6 +3685,7 @@ static const struct pid_entry tid_base_stuff[] = {
#endif
#ifdef CONFIG_KSM
ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
#endif
};
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 837971e74109..fe7bfcb7d049 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -4,6 +4,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
+#include "internal.h"
static int devinfo_show(struct seq_file *f, void *v)
{
@@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = {
static int __init proc_devices_init(void)
{
- proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_devices_init);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 06a80f78433d..b701d0207edf 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
return pde->flags & PROC_ENTRY_PERMANENT;
}
+static inline void pde_make_permanent(struct proc_dir_entry *pde)
+{
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -285,7 +290,7 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
#ifdef CONFIG_MMU
- struct vm_area_struct *tail_vma;
+ struct vma_iterator iter;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 592e6dc7c110..2fc92a13f9f8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -17,8 +17,6 @@
#include <asm/io.h>
-extern wait_queue_head_t log_wait;
-
static int kmsg_open(struct inode * inode, struct file * file)
{
return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index f32878d9a39f..817981e57223 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,6 +9,7 @@
#include <linux/seq_file.h>
#include <linux/seqlock.h>
#include <linux/time.h>
+#include "internal.h"
static int loadavg_proc_show(struct seq_file *m, void *v)
{
@@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
static int __init proc_loadavg_init(void)
{
- proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6e89f0e2fd20..5101131e6047 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -115,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
show_val_kb(m, "PageTables: ",
global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
@@ -162,7 +164,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
static int __init proc_meminfo_init(void)
{
- proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index a2873a617ae8..f2273b164535 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -91,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpagecount_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecount_read,
};
@@ -268,6 +269,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpageflags_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpageflags_read,
};
@@ -322,6 +324,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecgroup_read,
};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 021e83fe831f..48f2d60bd78a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -28,13 +28,6 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
-EXPORT_SYMBOL(sysctl_vals);
-
-const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
-EXPORT_SYMBOL_GPL(sysctl_long_vals);
-
/* Support for permanently empty directories */
struct ctl_table sysctl_mount_point[] = {
@@ -1246,7 +1239,7 @@ static bool get_links(struct ctl_dir *dir,
static int insert_links(struct ctl_table_header *head)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
- struct ctl_dir *core_parent = NULL;
+ struct ctl_dir *core_parent;
struct ctl_table_header *links;
int err;
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 12901dcf57e2..f4616083faef 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -3,6 +3,7 @@
#include <linux/kernel_stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
/*
* /proc/softirqs ... display the number of softirqs
@@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v)
static int __init proc_softirqs_init(void)
{
- proc_create_single("softirqs", 0, NULL, show_softirqs);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("softirqs", 0, NULL, show_softirqs);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4e0023643f8b..8a74cdcc9af0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
-#include <linux/vmacache.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
@@ -124,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -2UL;
+ vma = get_gate_vma(priv->mm);
+ }
+
+ return vma;
+}
+
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
/* See m_next(). Zero at the start or after lseek. */
if (last_addr == -1UL)
@@ -153,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return ERR_PTR(-EINTR);
}
+ vma_iter_init(&priv->iter, mm, last_addr);
hold_task_mempolicy(priv);
- priv->tail_vma = get_gate_vma(mm);
-
- vma = find_vma(mm, last_addr);
- if (vma)
- return vma;
+ if (last_addr == -2UL)
+ return get_gate_vma(mm);
- return priv->tail_vma;
+ return proc_get_vma(priv, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next, *vma = v;
-
- if (vma == priv->tail_vma)
- next = NULL;
- else if (vma->vm_next)
- next = vma->vm_next;
- else
- next = priv->tail_vma;
-
- *ppos = next ? next->vm_start : -1UL;
-
- return next;
+ if (*ppos == -2UL) {
+ *ppos = -1UL;
+ return NULL;
+ }
+ return proc_get_vma(m->private, ppos);
}
static void m_stop(struct seq_file *m, void *v)
@@ -864,7 +867,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- hugepage_vma_check(vma, vma->vm_flags, true, false));
+ hugepage_vma_check(vma, vma->vm_flags, true, false, true));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -877,16 +880,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct mem_size_stats mss;
- struct mm_struct *mm;
+ struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
- unsigned long last_vma_end = 0;
+ unsigned long vma_start = 0, last_vma_end = 0;
int ret = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return -ESRCH;
- mm = priv->mm;
if (!mm || !mmget_not_zero(mm)) {
ret = -ESRCH;
goto out_put_task;
@@ -899,8 +902,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_mm;
hold_task_mempolicy(priv);
+ vma = mas_find(&mas, ULONG_MAX);
+
+ if (unlikely(!vma))
+ goto empty_set;
- for (vma = priv->mm->mmap; vma;) {
+ vma_start = vma->vm_start;
+ do {
smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
@@ -909,6 +917,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* access it for write request.
*/
if (mmap_lock_is_contended(mm)) {
+ mas_pause(&mas);
mmap_read_unlock(mm);
ret = mmap_read_lock_killable(mm);
if (ret) {
@@ -952,7 +961,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* contains last_vma_end.
* Iterate VMA' from last_vma_end.
*/
- vma = find_vma(mm, last_vma_end - 1);
+ vma = mas_find(&mas, ULONG_MAX);
/* Case 3 above */
if (!vma)
break;
@@ -966,11 +975,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
smap_gather_stats(vma, &mss, last_vma_end);
}
/* Case 2 above */
- vma = vma->vm_next;
- }
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
- show_vma_header_prefix(m, priv->mm->mmap->vm_start,
- last_vma_end, 0, 0, 0, 0);
+empty_set:
+ show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
seq_pad(m, ' ');
seq_puts(m, "[rollup]\n");
@@ -1263,6 +1271,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
struct mmu_notifier_range range;
struct clear_refs_private cp = {
.type = type,
@@ -1282,7 +1291,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
}
if (type == CLEAR_REFS_SOFT_DIRTY) {
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
vma->vm_flags &= ~VM_SOFTDIRTY;
@@ -1294,8 +1303,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
- walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
- &cp);
+ walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
flush_tlb_mm(mm);
@@ -1418,9 +1426,19 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pte_swp_uffd_wp(pte))
flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
- if (pm->show_pfn)
+ if (pm->show_pfn) {
+ pgoff_t offset;
+ /*
+ * For PFN swap offsets, keeping the offset field
+ * to be PFN only to be compatible with old smaps.
+ */
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
flags |= PM_SWAP;
migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
@@ -1477,7 +1495,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
unsigned long offset;
if (pm->show_pfn) {
- offset = swp_offset(entry) +
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
+ offset = offset +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a6d21fc0033c..2fd06f52b6a4 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -20,15 +20,13 @@
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
-
- mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
bytes += kobjsize(vma);
region = vma->vm_region;
@@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long task_vsize(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- struct rb_node *p;
unsigned long vsize = 0;
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- }
mmap_read_unlock(mm);
return vsize;
}
@@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long size = kobjsize(mm);
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma) {
size += kobjsize(vma);
region = vma->vm_region;
if (region) {
@@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
*/
static int show_map(struct seq_file *m, void *_p)
{
- struct rb_node *p = _p;
-
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return nommu_vma_show(m, _p);
}
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
struct mm_struct *mm;
- struct rb_node *p;
- loff_t n = *pos;
+ struct vm_area_struct *vma;
+ unsigned long addr = *pos;
+
+ /* See m_next(). Zero at the start or after lseek. */
+ if (addr == -1UL)
+ return NULL;
/* pin the task and mm whilst we play with them */
priv->task = get_proc_task(priv->inode);
@@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EINTR);
}
- /* start from the Nth VMA */
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
- if (n-- == 0)
- return p;
+ /* start the next element from addr */
+ vma = find_vma(mm, addr);
+ if (vma)
+ return vma;
mmap_read_unlock(mm);
mmput(mm);
@@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml)
static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
{
- struct rb_node *p = _p;
+ struct vm_area_struct *vma = _p;
- (*pos)++;
- return p ? rb_next(p) : NULL;
+ *pos = vma->vm_end;
+ return find_vma(vma->vm_mm, vma->vm_end);
}
static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index deb99bc9b7e6..b5343d209381 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -7,6 +7,7 @@
#include <linux/time.h>
#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
+#include "internal.h"
static int uptime_proc_show(struct seq_file *m, void *v)
{
@@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v)
static int __init proc_uptime_init(void)
{
- proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index b449f186577f..02e3c3cd4a9a 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -5,6 +5,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
+#include "internal.h"
static int version_proc_show(struct seq_file *m, void *v)
{
@@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v)
static int __init proc_version_init(void)
{
- proc_create_single("version", 0, NULL, version_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("version", 0, NULL, version_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_version_init);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index b9895afca9d1..85b2fa3b211c 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -470,10 +470,8 @@ out2:
out1:
iput(sbi->inodes);
out:
- if (bh1)
- brelse(bh1);
- if (bh2)
- brelse(bh2);
+ brelse(bh1);
+ brelse(bh2);
outnobh:
kfree(qs);
s->s_fs_info = NULL;
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 5f2405994280..0f1493e0f6d0 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -71,6 +71,40 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
return ret;
}
+static inline int do_check_range(struct super_block *sb, const char *val_name,
+ uint val, uint min_val, uint max_val)
+{
+ if (val < min_val || val > max_val) {
+ quota_error(sb, "Getting %s %u out of range %u-%u",
+ val_name, val, min_val, max_val);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
+ struct qt_disk_dqdbheader *dh)
+{
+ int err = 0;
+
+ err = do_check_range(info->dqi_sb, "dqdh_next_free",
+ le32_to_cpu(dh->dqdh_next_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_prev_free",
+ le32_to_cpu(dh->dqdh_prev_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_entries",
+ le16_to_cpu(dh->dqdh_entries), 0,
+ qtree_dqstr_in_blk(info));
+
+ return err;
+}
+
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
@@ -85,6 +119,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info)
ret = read_blk(info, blk, buf);
if (ret < 0)
goto out_buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
}
else {
@@ -232,6 +269,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
*err = read_blk(info, blk, buf);
if (*err < 0)
goto out_buf;
+ *err = check_dquot_block_header(info, dh);
+ if (*err)
+ goto out_buf;
} else {
blk = get_free_dqblk(info);
if ((int)blk < 0) {
@@ -313,6 +353,10 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
}
ref = (__le32 *)buf;
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ ret = do_check_range(dquot->dq_sb, "block", newblk, 0,
+ info->dqi_blocks - 1);
+ if (ret)
+ goto out_buf;
if (!newblk)
newson = 1;
if (depth == info->dqi_qtree_depth - 1) {
@@ -424,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
dh = (struct qt_disk_dqdbheader *)buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
le16_add_cpu(&dh->dqdh_entries, -1);
if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
ret = remove_free_dqentry(info, buf, blk);
@@ -480,12 +527,10 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
- if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
- quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
- newblk, info->dqi_blocks);
- ret = -EUCLEAN;
+ ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF,
+ info->dqi_blocks - 1);
+ if (ret)
goto out_buf;
- }
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
@@ -586,12 +631,10 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
if (!blk) /* No reference? */
goto out_buf;
- if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
- quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
- blk, info->dqi_blocks);
- ret = -EUCLEAN;
+ ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF,
+ info->dqi_blocks - 1);
+ if (ret)
goto out_buf;
- }
if (depth < info->dqi_qtree_depth - 1)
ret = find_tree_dqentry(info, dquot, blk, depth+1);
@@ -705,15 +748,21 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
goto out_buf;
}
for (i = __get_index(info, *id, depth); i < epb; i++) {
- if (ref[i] == cpu_to_le32(0)) {
+ uint blk_no = le32_to_cpu(ref[i]);
+
+ if (blk_no == 0) {
*id += level_inc;
continue;
}
+ ret = do_check_range(info->dqi_sb, "block", blk_no, 0,
+ info->dqi_blocks - 1);
+ if (ret)
+ goto out_buf;
if (depth == info->dqi_qtree_depth - 1) {
ret = 0;
goto out_buf;
}
- ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1);
+ ret = find_next_id(info, id, blk_no, depth + 1);
if (ret != -ENOENT)
break;
}
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ba3525ccc27e..cb240eac5036 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- unsigned long maxpages, lpages, nr, loop, ret;
+ unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn;
struct inode *inode = file_inode(file);
- struct page **pages = NULL, **ptr, *page;
+ struct folio_batch fbatch;
loff_t isize;
/* the mapping mustn't extend beyond the EOF */
@@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
goto out;
/* gang-find the pages */
- pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- goto out_free;
-
- nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages);
- if (nr != lpages)
- goto out_free_pages; /* leave if some pages were missing */
+ folio_batch_init(&fbatch);
+ nr_pages = 0;
+repeat:
+ nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff,
+ ULONG_MAX, &fbatch);
+ if (!nr_folios) {
+ ret = -ENOSYS;
+ return ret;
+ }
+ if (ret == -ENOSYS) {
+ ret = (unsigned long) folio_address(fbatch.folios[0]);
+ pfn = folio_pfn(fbatch.folios[0]);
+ }
/* check the pages for physical adjacency */
- ptr = pages;
- page = *ptr++;
- page++;
- for (loop = lpages; loop > 1; loop--)
- if (*ptr++ != page++)
- goto out_free_pages;
+ for (loop = 0; loop < nr_folios; loop++) {
+ if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) {
+ ret = -ENOSYS;
+ goto out_free; /* leave if not physical adjacent */
+ }
+ nr_pages += folio_nr_pages(fbatch.folios[loop]);
+ if (nr_pages >= lpages)
+ goto out_free; /* successfully found desired pages*/
+ }
+ if (nr_pages < lpages) {
+ folio_batch_release(&fbatch);
+ goto repeat; /* loop if pages are missing */
+ }
/* okay - all conditions fulfilled */
- ret = (unsigned long) page_address(pages[0]);
-out_free_pages:
- ptr = pages;
- for (loop = nr; loop > 0; loop--)
- put_page(*ptr++);
out_free:
- kfree(pages);
+ folio_batch_release(&fbatch);
out:
return ret;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index bc66d0173e33..b3257e852820 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -146,15 +146,15 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
}
static int ramfs_tmpfile(struct user_namespace *mnt_userns,
- struct inode *dir, struct dentry *dentry, umode_t mode)
+ struct inode *dir, struct file *file, umode_t mode)
{
struct inode *inode;
inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
if (!inode)
return -ENOSPC;
- d_tmpfile(dentry, inode);
- return 0;
+ d_tmpfile(file, inode);
+ return finish_open_simple(file, 0);
}
static const struct inode_operations ramfs_dir_inode_operations = {
diff --git a/fs/readdir.c b/fs/readdir.c
index 09e8ed7d4161..9c53edb60c03 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -140,7 +140,7 @@ struct readdir_callback {
int result;
};
-static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
+static bool fillonedir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct readdir_callback *buf =
@@ -149,14 +149,14 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
unsigned long d_ino;
if (buf->result)
- return -EINVAL;
+ return false;
buf->result = verify_dirent_name(name, namlen);
- if (buf->result < 0)
- return buf->result;
+ if (buf->result)
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
buf->result++;
dirent = buf->dirent;
@@ -169,12 +169,12 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_write_access_end();
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->result = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
@@ -219,7 +219,7 @@ struct getdents_callback {
int error;
};
-static int filldir(struct dir_context *ctx, const char *name, int namlen,
+static bool filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent __user *dirent, *prev;
@@ -232,18 +232,18 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->error = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -260,12 +260,12 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(getdents, unsigned int, fd,
@@ -307,7 +307,7 @@ struct getdents_callback64 {
int error;
};
-static int filldir64(struct dir_context *ctx, const char *name, int namlen,
+static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent64 __user *dirent, *prev;
@@ -319,13 +319,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *)dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -342,13 +342,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(getdents64, unsigned int, fd,
@@ -397,7 +397,7 @@ struct compat_readdir_callback {
int result;
};
-static int compat_fillonedir(struct dir_context *ctx, const char *name,
+static bool compat_fillonedir(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -407,14 +407,14 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
compat_ulong_t d_ino;
if (buf->result)
- return -EINVAL;
+ return false;
buf->result = verify_dirent_name(name, namlen);
- if (buf->result < 0)
- return buf->result;
+ if (buf->result)
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
buf->result++;
dirent = buf->dirent;
@@ -427,12 +427,12 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_write_access_end();
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->result = -EFAULT;
- return -EFAULT;
+ return false;
}
COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
@@ -471,7 +471,7 @@ struct compat_getdents_callback {
int error;
};
-static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
+static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct compat_linux_dirent __user *dirent, *prev;
@@ -484,18 +484,18 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->error = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -511,12 +511,12 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 94addfcefede..9f62da7471c9 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -868,7 +868,7 @@ loop_next:
*/
if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
spin_unlock(lock);
- ll_rw_block(REQ_OP_WRITE, 1, &bh);
+ write_dirty_buffer(bh, 0);
spin_lock(lock);
}
put_bh(bh);
@@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s,
if (tbh) {
if (buffer_dirty(tbh)) {
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_WRITE, 1, &tbh);
+ write_dirty_buffer(tbh, 0);
reiserfs_write_lock_nested(s, depth);
}
put_bh(tbh) ;
@@ -2240,7 +2240,7 @@ abort_replay:
}
}
/* read in the log blocks, memcpy to the corresponding real block */
- ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks);
+ bh_read_batch(get_desc_trans_len(desc), log_blocks);
for (i = 0; i < get_desc_trans_len(desc); i++) {
wait_on_buffer(log_blocks[i]);
@@ -2342,10 +2342,11 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
} else
bhlist[j++] = bh;
}
- ll_rw_block(REQ_OP_READ, j, bhlist);
+ bh = bhlist[0];
+ bh_read_nowait(bh, 0);
+ bh_readahead_batch(j - 1, &bhlist[1], 0);
for (i = 1; i < j; i++)
brelse(bhlist[i]);
- bh = bhlist[0];
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 30319dc33c18..84a194b77f19 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -456,7 +456,7 @@ static int print_internal(struct buffer_head *bh, int first, int last)
to = B_NR_ITEMS(bh);
} else {
from = first;
- to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh);
+ to = min_t(int, last, B_NR_ITEMS(bh));
}
reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 4a7cb16e9345..3dba8acf4e83 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb)
char *s;
/* Some block devices use /'s */
- strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+ strscpy(b, sb->s_id, BDEVNAME_SIZE);
s = strchr(b, '/');
if (s)
*s = '!';
@@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb)
char *s;
/* Some block devices use /'s */
- strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+ strscpy(b, sb->s_id, BDEVNAME_SIZE);
s = strchr(b, '/');
if (s)
*s = '!';
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 8096c74c38ac..7b498a0d060b 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -97,7 +97,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
* using the copy_size var below allows this code to work for
* both shrinking and expanding the FS.
*/
- copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
+ copy_size = min(bmap_nr_new, bmap_nr);
copy_size =
copy_size * sizeof(struct reiserfs_list_bitmap_node *);
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 9a293609a022..84c12a1947b2 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s,
if (!buffer_uptodate(bh[j])) {
if (depth == -1)
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j);
+ bh_readahead(bh[j], REQ_RAHEAD);
}
brelse(bh[j]);
}
@@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
if (!buffer_uptodate(bh) && depth == -1)
depth = reiserfs_write_unlock_nested(sb);
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
wait_on_buffer(bh);
if (depth != -1)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c88cd2ce0665..929acce6e731 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1702,9 +1702,7 @@ static int read_super_block(struct super_block *s, int offset)
/* after journal replay, reread all bitmap and super blocks */
static int reread_meta_blocks(struct super_block *s)
{
- ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s));
- wait_on_buffer(SB_BUFFER_WITH_SB(s));
- if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
+ if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
return 1;
}
@@ -2504,9 +2502,7 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
len = i_size - off;
toread = len;
while (toread > 0) {
- tocopy =
- sb->s_blocksize - offset <
- toread ? sb->s_blocksize - offset : toread;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
/*
* Quota files are without tails so we can safely
@@ -2554,8 +2550,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
return -EIO;
}
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
reiserfs_write_lock(sb);
err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 436641369283..8b2d52443f41 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -189,7 +189,7 @@ struct reiserfs_dentry_buf {
struct dentry *dentries[8];
};
-static int
+static bool
fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -200,16 +200,16 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
- return -ENOSPC;
+ return false;
if (name[0] == '.' && (namelen < 2 ||
(namelen == 2 && name[1] == '.')))
- return 0;
+ return true;
dentry = lookup_one_len(name, dbuf->xadir, namelen);
if (IS_ERR(dentry)) {
dbuf->err = PTR_ERR(dentry);
- return PTR_ERR(dentry);
+ return false;
} else if (d_really_is_negative(dentry)) {
/* A directory entry exists, but no file? */
reiserfs_error(dentry->d_sb, "xattr-20003",
@@ -218,11 +218,11 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
dentry, dbuf->xadir);
dput(dentry);
dbuf->err = -EIO;
- return -EIO;
+ return false;
}
dbuf->dentries[dbuf->count++] = dentry;
- return 0;
+ return true;
}
static void
@@ -797,7 +797,7 @@ struct listxattr_buf {
struct dentry *dentry;
};
-static int listxattr_filler(struct dir_context *ctx, const char *name,
+static bool listxattr_filler(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -813,19 +813,19 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
name);
if (!handler /* Unsupported xattr name */ ||
(handler->list && !handler->list(b->dentry)))
- return 0;
+ return true;
size = namelen + 1;
if (b->buf) {
if (b->pos + size > b->size) {
b->pos = -ERANGE;
- return -ERANGE;
+ return false;
}
memcpy(b->buf + b->pos, name, namelen);
b->buf[b->pos + namelen] = 0;
}
b->pos += size;
}
- return 0;
+ return true;
}
/*
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index 2cab413fffee..7d605db3bb3b 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -1101,7 +1101,11 @@ struct smb2_change_notify_rsp {
#define SMB2_CREATE_REQUEST_LEASE "RqLs"
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
-#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74"
+#define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10"
+#define SVHDX_OPEN_DEVICE_CONTEXT "\x9C\xCB\xCF\x9E\x04\xC1\xE6\x43\x98\x0E\x15\x8D\xA1\xF6\xEC\x83"
+#define SMB2_CREATE_TAG_AAPL "AAPL"
/* Flag (SMB3 open response) values */
#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
diff --git a/fs/super.c b/fs/super.c
index 6a82660e1adb..8d39e4f11cfa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -291,6 +291,7 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
security_sb_free(s);
+ fscrypt_destroy_keyring(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
@@ -479,7 +480,7 @@ void generic_shutdown_super(struct super_block *sb)
evict_inodes(sb);
/* only nonzero refcount inodes can have marks */
fsnotify_sb_delete(sb);
- fscrypt_sb_delete(sb);
+ fscrypt_destroy_keyring(sb);
security_sb_delete(sb);
if (sb->s_dio_done_wq) {
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index c57b46a352d8..3125e76376ee 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode)
return ubifs_check_dir_empty(inode) == 0;
}
+/**
+ * ubifs_encrypt - Encrypt data.
+ * @inode: inode which refers to the data node
+ * @dn: data node to encrypt
+ * @in_len: length of data to be compressed
+ * @out_len: allocated memory size for the data area of @dn
+ * @block: logical block number of the block
+ *
+ * This function encrypt a possibly-compressed data in the data node.
+ * The encrypted data length will store in @out_len.
+ */
int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
unsigned int in_len, unsigned int *out_len, int block)
{
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index fc718f6178f2..3f128b9fdfbb 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2467,7 +2467,7 @@ error_dump:
static inline int chance(unsigned int n, unsigned int out_of)
{
- return !!((prandom_u32() % out_of) + 1 <= n);
+ return !!(prandom_u32_max(out_of) + 1 <= n);
}
@@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
if (chance(1, 2)) {
d->pc_delay = 1;
/* Fail within 1 minute */
- delay = prandom_u32() % 60000;
+ delay = prandom_u32_max(60000);
d->pc_timeout = jiffies;
d->pc_timeout += msecs_to_jiffies(delay);
ubifs_warn(c, "failing after %lums", delay);
} else {
d->pc_delay = 2;
- delay = prandom_u32() % 10000;
+ delay = prandom_u32_max(10000);
/* Fail within 10000 operations */
d->pc_cnt_max = delay;
ubifs_warn(c, "failing after %lu calls", delay);
@@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
unsigned int from, to, ffs = chance(1, 2);
unsigned char *p = (void *)buf;
- from = prandom_u32() % len;
+ from = prandom_u32_max(len);
/* Corruption span max to end of write unit */
to = min(len, ALIGN(from + 1, c->max_write_size));
@@ -2581,7 +2581,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
if (ffs)
memset(p + from, 0xFF, to - from);
else
- prandom_bytes(p + from, to - from);
+ get_random_bytes(p + from, to - from);
return to;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 86151889548e..0f29cf201136 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode)
* @c: UBIFS file-system description object
* @dir: parent directory inode
* @mode: inode mode flags
+ * @is_xattr: whether the inode is xattr inode
*
* This function finds an unused inode number, allocates new inode and
* initializes it. Returns new inode in case of success and an error code in
* case of failure.
*/
struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
- umode_t mode)
+ umode_t mode, bool is_xattr)
{
int err;
struct inode *inode;
@@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
current_time(inode);
inode->i_mapping->nrpages = 0;
- err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
- if (err) {
- ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
- goto out_iput;
+ if (!is_xattr) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+ if (err) {
+ ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
+ goto out_iput;
+ }
}
switch (mode & S_IFMT) {
@@ -309,7 +312,7 @@ static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
@@ -370,7 +373,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
if (err)
return ERR_PTR(err);
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_free;
@@ -424,8 +427,9 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
}
static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
@@ -462,7 +466,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
return err;
}
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_budg;
@@ -475,7 +479,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
mutex_lock(&ui->ui_mutex);
insert_inode_hash(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
ubifs_assert(c, ui->dirty);
instantiated = 1;
@@ -489,7 +493,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
ubifs_release_budget(c, &req);
- return 0;
+ return finish_open_simple(file, 0);
out_cancel:
unlock_2_inodes(dir, inode);
@@ -872,7 +876,7 @@ out_fname:
}
/**
- * check_dir_empty - check if a directory is empty or not.
+ * ubifs_check_dir_empty - check if a directory is empty or not.
* @dir: VFS inode object of the directory to check
*
* This function checks if directory @dir is empty. Returns zero if the
@@ -1004,7 +1008,7 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
+ inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
@@ -1091,7 +1095,7 @@ static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
kfree(dev);
err = PTR_ERR(inode);
@@ -1173,7 +1177,7 @@ static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
+ inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 75dab0ae3939..d02509920baf 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
{
if (c->double_hash)
- dent->cookie = (__force __le32) prandom_u32();
+ dent->cookie = (__force __le32) get_random_u32();
else
dent->cookie = 0;
}
@@ -1472,23 +1472,25 @@ out_free:
* @block: data block number
* @dn: data node to re-compress
* @new_len: new length
+ * @dn_size: size of the data node @dn in memory
*
* This function is used when an inode is truncated and the last data node of
* the inode has to be re-compressed/encrypted and re-written.
*/
static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode,
unsigned int block, struct ubifs_data_node *dn,
- int *new_len)
+ int *new_len, int dn_size)
{
void *buf;
- int err, dlen, compr_type, out_len, old_dlen;
+ int err, dlen, compr_type, out_len, data_size;
out_len = le32_to_cpu(dn->size);
buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS);
if (!buf)
return -ENOMEM;
- dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ data_size = dn_size - UBIFS_DATA_NODE_SZ;
compr_type = le16_to_cpu(dn->compr_type);
if (IS_ENCRYPTED(inode)) {
@@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in
}
if (IS_ENCRYPTED(inode)) {
- err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block);
+ err = ubifs_encrypt(inode, dn, out_len, &data_size, block);
if (err)
goto out;
- out_len = old_dlen;
+ out_len = data_size;
} else {
dn->compr_size = 0;
}
@@ -1550,6 +1552,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
struct ubifs_trun_node *trun;
struct ubifs_data_node *dn;
int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
+ int dn_size;
struct ubifs_inode *ui = ubifs_inode(inode);
ino_t inum = inode->i_ino;
unsigned int blk;
@@ -1562,10 +1565,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
ubifs_assert(c, S_ISREG(inode->i_mode));
ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
- sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
- UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+ dn_size = COMPRESSED_DATA_NODE_BUF_SZ;
- sz += ubifs_auth_node_sz(c);
+ if (IS_ENCRYPTED(inode))
+ dn_size += UBIFS_CIPHER_BLOCK_SIZE;
+
+ sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
+ dn_size + ubifs_auth_node_sz(c);
ino = kmalloc(sz, GFP_NOFS);
if (!ino)
@@ -1596,15 +1602,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) {
ubifs_err(c, "bad data node (block %u, inode %lu)",
blk, inode->i_ino);
- ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ -
- UBIFS_TRUN_NODE_SZ);
+ ubifs_dump_node(c, dn, dn_size);
goto out_free;
}
if (dn_len <= dlen)
dlen = 0; /* Nothing to do */
else {
- err = truncate_data_node(c, inode, blk, dn, &dlen);
+ err = truncate_data_node(c, inode, blk, dn,
+ &dlen, dn_size);
if (err)
goto out_free;
}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d76a19e460cd..cfbc31f709f4 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c)
if (!dbg_is_chk_gen(c))
return 0;
- if (prandom_u32() & 3)
+ if (prandom_u32_max(4))
return 0;
for (i = 0; i < c->lsave_cnt; i++)
c->lsave[i] = c->main_first;
list_for_each_entry(lprops, &c->empty_list, list)
- c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->freeable_list, list)
- c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->frdi_idx_list, list)
- c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_FREE - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
return 1;
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 58c92c96ecef..01362ad5f804 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
c->ilebs[c->ileb_cnt++] = lnum;
dbg_cmt("LEB %d", lnum);
}
- if (dbg_is_chk_index(c) && !(prandom_u32() & 7))
+ if (dbg_is_chk_index(c) && !prandom_u32_max(8))
return -ENOSPC;
return 0;
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 7d6d2f152e03..478bbbb5382f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2026,7 +2026,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
- umode_t mode);
+ umode_t mode, bool is_xattr);
int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
int ubifs_check_dir_empty(struct inode *dir);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e4c4761aff7f..3db8486e3725 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
if (err)
return err;
- inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
+ inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_budg;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index cad3772f9dbe..be640f4b2f2c 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
+ bh_readahead_batch(num, bha, REQ_RAHEAD);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index a2adf6293093..16bcf2c6b8b3 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
+ bh_readahead_batch(num, bha, REQ_RAHEAD);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 09aef77269fe..5c659e23e578 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -252,6 +252,7 @@ const struct file_operations udf_file_operations = {
.release = udf_release_file,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8d06daed549f..dce6ae9ae306 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1211,13 +1211,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
if (!bh)
return NULL;
- if (buffer_uptodate(bh))
- return bh;
-
- ll_rw_block(REQ_OP_READ, 1, &bh);
-
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
+ if (bh_read(bh, 0) >= 0)
return bh;
brelse(bh);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index b3d5f97f16cd..fb4c30e05245 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -626,7 +626,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir,
}
static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct inode *inode = udf_new_inode(dir, mode);
@@ -640,9 +640,9 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
inode->i_op = &udf_file_inode_operations;
inode->i_fop = &udf_file_operations;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
unlock_new_inode(inode);
- return 0;
+ return finish_open_simple(file, 0);
}
static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir,
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index bd810d8239f2..2436e3f82147 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -295,14 +295,10 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, oldb + pos);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- ufs_error(inode->i_sb, __func__,
- "read of block failed\n");
- break;
- }
+ if (bh_read(bh, 0) < 0) {
+ ufs_error(inode->i_sb, __func__,
+ "read of block failed\n");
+ break;
}
UFSD(" change from %llu to %llu, pos %u\n",
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0c1d33c4f74c..07c81ab3fd4d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
+#include <linux/miscdevice.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
- if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
- ctx->flags & UFFD_USER_MODE_ONLY) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
+ if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
goto out;
- }
/*
* If it's already released don't get it. This avoids to loop
@@ -615,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
if (release_new_ctx) {
struct vm_area_struct *vma;
struct mm_struct *mm = release_new_ctx->mm;
+ VMA_ITERATOR(vmi, mm, 0);
/* the various vma->vm_userfaultfd_ctx still points to it */
mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~__VM_UFFD_FLAGS;
}
+ }
mmap_write_unlock(mm);
userfaultfd_ctx_put(release_new_ctx);
@@ -803,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
return false;
}
-int userfaultfd_unmap_prep(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *unmaps)
+int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *unmaps)
{
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+ VMA_ITERATOR(vmi, mm, start);
+ struct vm_area_struct *vma;
+
+ for_each_vma_range(vmi, vma, end) {
struct userfaultfd_unmap_ctx *unmap_ctx;
struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
@@ -857,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
WRITE_ONCE(ctx->released, true);
@@ -873,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
*/
mmap_write_lock(mm);
prev = NULL;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
cond_resched();
BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
!!(vma->vm_flags & __VM_UFFD_FLAGS));
@@ -887,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev)
+ if (prev) {
+ mas_pause(&mas);
vma = prev;
- else
+ } else {
prev = vma;
+ }
+
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -1272,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool found;
bool basic_ioctls;
unsigned long start, end, vma_end;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1314,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- vma = find_vma_prev(mm, start, &prev);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma)
goto out_unlock;
@@ -1339,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
*/
found = false;
basic_ioctls = false;
- for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1399,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
}
BUG_ON(!found);
- if (vma->vm_start < start)
- prev = vma;
+ mas_set(&mas, start);
+ prev = mas_prev(&mas, 0);
+ if (prev != vma)
+ mas_next(&mas, ULONG_MAX);
ret = 0;
do {
@@ -1430,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
((struct vm_userfaultfd_ctx){ ctx }),
anon_vma_name(vma));
if (prev) {
+ /* vma_merge() invalidated the mas */
+ mas_pause(&mas);
vma = prev;
goto next;
}
@@ -1437,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
ret = split_vma(mm, vma, start, 1);
if (ret)
break;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
if (vma->vm_end > end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
break;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
next:
/*
@@ -1458,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
skip:
prev = vma;
start = vma->vm_end;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
+ vma = mas_next(&mas, end - 1);
+ } while (vma);
out_unlock:
mmap_write_unlock(mm);
mmput(mm);
@@ -1503,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1521,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- vma = find_vma_prev(mm, start, &prev);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma)
goto out_unlock;
@@ -1546,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
*/
found = false;
ret = -EINVAL;
- for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1566,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
}
BUG_ON(!found);
- if (vma->vm_start < start)
- prev = vma;
+ mas_set(&mas, start);
+ prev = mas_prev(&mas, 0);
+ if (prev != vma)
+ mas_next(&mas, ULONG_MAX);
ret = 0;
do {
@@ -1636,8 +1654,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
skip:
prev = vma;
start = vma->vm_end;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
+ vma = mas_next(&mas, end - 1);
+ } while (vma);
out_unlock:
mmap_write_unlock(mm);
mmput(mm);
@@ -2056,20 +2074,11 @@ static void init_once_userfaultfd_ctx(void *mem)
seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}
-SYSCALL_DEFINE1(userfaultfd, int, flags)
+static int new_userfaultfd(int flags)
{
struct userfaultfd_ctx *ctx;
int fd;
- if (!sysctl_unprivileged_userfaultfd &&
- (flags & UFFD_USER_MODE_ONLY) == 0 &&
- !capable(CAP_SYS_PTRACE)) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
- return -EPERM;
- }
-
BUG_ON(!current->mm);
/* Check the UFFD_* constants for consistency. */
@@ -2102,8 +2111,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
return fd;
}
+static inline bool userfaultfd_syscall_allowed(int flags)
+{
+ /* Userspace-only page faults are always allowed */
+ if (flags & UFFD_USER_MODE_ONLY)
+ return true;
+
+ /*
+ * The user is requesting a userfaultfd which can handle kernel faults.
+ * Privileged users are always allowed to do this.
+ */
+ if (capable(CAP_SYS_PTRACE))
+ return true;
+
+ /* Otherwise, access to kernel fault handling is sysctl controlled. */
+ return sysctl_unprivileged_userfaultfd;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ if (!userfaultfd_syscall_allowed(flags))
+ return -EPERM;
+
+ return new_userfaultfd(flags);
+}
+
+static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
+{
+ if (cmd != USERFAULTFD_IOC_NEW)
+ return -EINVAL;
+
+ return new_userfaultfd(flags);
+}
+
+static const struct file_operations userfaultfd_dev_fops = {
+ .unlocked_ioctl = userfaultfd_dev_ioctl,
+ .compat_ioctl = userfaultfd_dev_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice userfaultfd_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "userfaultfd",
+ .fops = &userfaultfd_dev_fops
+};
+
static int __init userfaultfd_init(void)
{
+ int ret;
+
+ ret = misc_register(&userfaultfd_misc);
+ if (ret)
+ return ret;
+
userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
sizeof(struct userfaultfd_ctx),
0,
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 629785c95007..dbe1ce5b450a 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -70,8 +70,6 @@ struct fsverity_info {
const struct inode *inode;
};
-/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
-#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
sizeof(struct fsverity_descriptor))
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index e2bdf089c0a3..6261599bb389 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1520,7 +1520,7 @@ xfs_alloc_ag_vextent_lastblock(
#ifdef DEBUG
/* Randomly don't execute the first algorithm. */
- if (prandom_u32() & 1)
+ if (prandom_u32_max(2))
return 0;
#endif
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index e56723dc9cd5..49d0d4ea63fc 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -294,7 +294,7 @@ xfs_check_block(
else
thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
if (*thispa == *pp) {
- xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
__func__, j, i,
(unsigned long long)be64_to_cpu(*thispa));
xfs_err(mp, "%s: ptrs are equal in node\n",
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e7201dc68f43..e576560b46e9 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2192,8 +2192,8 @@ xfs_da_grow_inode_int(
*/
mapp = kmem_alloc(sizeof(*mapp) * count, 0);
for (b = *bno, mapi = 0; b < *bno + count; ) {
- nmap = min(XFS_BMAP_MAX_NMAP, count);
c = (int)(*bno + count - b);
+ nmap = min(XFS_BMAP_MAX_NMAP, c);
error = xfs_bmapi_write(tp, dp, b, c,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
args->total, &mapp[mapi], &nmap);
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 76eedc2756b3..92bac3373f1f 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -261,7 +261,7 @@ xfs_dir_createname(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -357,7 +357,7 @@ xfs_dir_lookup(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -435,7 +435,7 @@ xfs_dir_removename(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
@@ -493,7 +493,7 @@ xfs_dir_replace(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -610,19 +610,23 @@ xfs_dir2_grow_inode(
int
xfs_dir2_isblock(
struct xfs_da_args *args,
- int *vp) /* out: 1 is block, 0 is not block */
+ bool *isblock)
{
- xfs_fileoff_t last; /* last file offset */
- int rval;
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_fileoff_t eof;
+ int error;
- if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
- return rval;
- rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
- if (XFS_IS_CORRUPT(args->dp->i_mount,
- rval != 0 &&
- args->dp->i_disk_size != args->geo->blksize))
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isblock = false;
+ if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize)
+ return 0;
+
+ *isblock = true;
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
return -EFSCORRUPTED;
- *vp = rval;
return 0;
}
@@ -632,14 +636,20 @@ xfs_dir2_isblock(
int
xfs_dir2_isleaf(
struct xfs_da_args *args,
- int *vp) /* out: 1 is block, 0 is not block */
+ bool *isleaf)
{
- xfs_fileoff_t last; /* last file offset */
- int rval;
+ xfs_fileoff_t eof;
+ int error;
- if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
- return rval;
- *vp = last == args->geo->leafblk + args->geo->fsbcount;
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isleaf = false;
+ if (eof != args->geo->leafblk + args->geo->fsbcount)
+ return 0;
+
+ *isleaf = true;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index b6df3c34b26a..dd39f17dd9a9 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -61,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
* Interface routines used by userspace utilities
*/
-extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
-extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 003812fd7d35..8cd37e6e9d38 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -865,7 +865,6 @@ xfs_dir2_sf_lookup(
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
int i; /* entry index */
- int error;
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
enum xfs_dacmp cmp; /* comparison result */
@@ -929,8 +928,7 @@ xfs_dir2_sf_lookup(
if (!ci_sfep)
return -ENOENT;
/* otherwise process the CI match as required by the caller */
- error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
- return error;
+ return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
}
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 6cdfd64bc56b..94db50eb706a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc(
/* randomly do sparse inode allocations */
if (xfs_has_sparseinodes(tp->t_mountp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks)
- do_sparse = prandom_u32() & 1;
+ do_sparse = prandom_u32_max(2);
#endif
/*
@@ -805,7 +805,7 @@ sparse_alloc:
* number from being easily guessable.
*/
error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
- args.agbno, args.len, prandom_u32());
+ args.agbno, args.len, get_random_u32());
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9327a4f39206..6b21760184d9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -78,7 +78,7 @@ xfs_iformat_local(
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %d for local fork, size = %zd).",
+ "corrupt inode %llu (bad size %d for local fork, size = %zd).",
(unsigned long long) ip->i_ino, size,
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
@@ -192,7 +192,7 @@ xfs_iformat_btree(
XFS_DFORK_SIZE(dip, mp, whichfork) ||
ifp->if_nextents > ip->i_nblocks) ||
level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
- xfs_warn(mp, "corrupt inode %Lu (btree).",
+ xfs_warn(mp, "corrupt inode %llu (btree).",
(unsigned long long) ip->i_ino);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_btree", dfp, size,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 5abb5fdb71d9..5c87800ab223 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -99,7 +99,7 @@ out:
* we check the inode number to make sure it's sane, then we check that
* we can look up this filename. Finally, we check the ftype.
*/
-STATIC int
+STATIC bool
xchk_dir_actor(
struct dir_context *dir_iter,
const char *name,
@@ -124,7 +124,7 @@ xchk_dir_actor(
xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
if (xchk_should_terminate(sdc->sc, &error))
- return error;
+ return !error;
/* Does this inode number make sense? */
if (!xfs_verify_dir_ino(mp, ino)) {
@@ -191,8 +191,8 @@ out:
* and return zero to xchk_directory.
*/
if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return -EFSCORRUPTED;
- return error;
+ return false;
+ return !error;
}
/* Scrub a directory btree record. */
@@ -676,7 +676,7 @@ xchk_directory_blocks(
xfs_dablk_t dabno;
xfs_dir2_db_t last_data_db = 0;
bool found;
- int is_block = 0;
+ bool is_block = false;
int error;
/* Ignore local format directories. */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index ab182a5cd0c0..d8dff3fd8053 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -38,7 +38,7 @@ struct xchk_parent_ctx {
};
/* Look for a single entry in a directory pointing to an inode. */
-STATIC int
+STATIC bool
xchk_parent_actor(
struct dir_context *dc,
const char *name,
@@ -62,7 +62,7 @@ xchk_parent_actor(
if (xchk_should_terminate(spc->sc, &error))
spc->cancelled = true;
- return error;
+ return !error;
}
/* Count the number of dentries in the parent dir that point to this inode. */
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 5077a7ad5646..cf5ce607dc05 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -86,8 +86,6 @@ xfs_attri_log_nameval_alloc(
*/
nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
name_len + value_len);
- if (!nv)
- return nv;
nv->name.i_addr = nv + 1;
nv->name.i_len = name_len;
@@ -441,8 +439,6 @@ xfs_attr_create_intent(
attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
args->namelen, args->value, args->valuelen);
}
- if (!attr->xattri_nameval)
- return ERR_PTR(-ENOMEM);
attrip = xfs_attri_init(mp, attr->xattri_nameval);
xfs_trans_add_item(tp, &attrip->attri_item);
@@ -762,8 +758,6 @@ xlog_recover_attri_commit_pass2(
nv = xfs_attri_log_nameval_alloc(attr_name,
attri_formatp->alfi_name_len, attr_value,
attri_formatp->alfi_value_len);
- if (!nv)
- return -ENOMEM;
attrip = xfs_attri_init(mp, nv);
error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index e295fc8062d8..9f3ceb461515 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -512,7 +512,7 @@ xfs_readdir(
{
struct xfs_da_args args = { NULL };
unsigned int lock_mode;
- int isblock;
+ bool isblock;
int error;
trace_xfs_readdir(dp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 296faa41d81d..7db588ed0be5 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -274,7 +274,7 @@ xfs_errortag_test(
ASSERT(error_tag < XFS_ERRTAG_MAX);
randfactor = mp->m_errortag[error_tag];
- if (!randfactor || prandom_u32() % randfactor)
+ if (!randfactor || prandom_u32_max(randfactor))
return false;
xfs_warn_ratelimited(mp,
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 2bbe7916a998..eae7427062cf 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -596,7 +596,7 @@ xfs_iget_cache_miss(
*/
if (xfs_has_v3inodes(mp) &&
(flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
- VFS_I(ip)->i_generation = prandom_u32();
+ VFS_I(ip)->i_generation = get_random_u32();
} else {
struct xfs_buf *bp;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 28493c8e9bb2..c000b74dd203 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -835,9 +835,8 @@ xfs_init_new_inode(
* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
- if (irix_sgid_inherit &&
- (inode->i_mode & S_ISGID) &&
- !in_group_p(i_gid_into_mnt(mnt_userns, inode)))
+ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
+ !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)))
inode->i_mode &= ~S_ISGID;
ip->i_disk_size = 0;
@@ -3119,7 +3118,7 @@ xfs_iflush(
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
+ "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
goto flush_out;
}
@@ -3129,7 +3128,7 @@ xfs_iflush(
ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad regular inode %Lu, ptr "PTR_FMT,
+ "%s: Bad regular inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
@@ -3140,7 +3139,7 @@ xfs_iflush(
ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad directory inode %Lu, ptr "PTR_FMT,
+ "%s: Bad directory inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
@@ -3158,7 +3157,7 @@ xfs_iflush(
if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
+ "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_forkoff, ip);
goto flush_out;
}
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6e19ece916bf..ca2941ab6cbc 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -550,7 +550,7 @@ xfs_inode_item_push(
if (!bp || (ip->i_flags & XFS_ISTALE)) {
/*
- * Inode item/buffer is being being aborted due to cluster
+ * Inode item/buffer is being aborted due to cluster
* buffer deletion. Trigger a log force to have that operation
* completed and items removed from the AIL before the next push
* attempt.
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index d28ffaebd067..0e5dba2343ea 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -321,7 +321,7 @@ xlog_recover_inode_commit_pass2(
*/
if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
- "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
+ "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld",
__func__, dip, bp, in_f->ilf_ino);
error = -EFSCORRUPTED;
goto out_release;
@@ -329,7 +329,7 @@ xlog_recover_inode_commit_pass2(
ldip = item->ri_buf[1].i_addr;
if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
+ "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
__func__, item, in_f->ilf_ino);
error = -EFSCORRUPTED;
goto out_release;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f51c60d7e205..2e10e1c66ad6 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -167,7 +167,7 @@ xfs_generic_create(
struct dentry *dentry,
umode_t mode,
dev_t rdev,
- bool tmpfile) /* unnamed file */
+ struct file *tmpfile) /* unnamed file */
{
struct inode *inode;
struct xfs_inode *ip = NULL;
@@ -234,7 +234,7 @@ xfs_generic_create(
* d_tmpfile can immediately set it back to zero.
*/
set_nlink(inode, 1);
- d_tmpfile(dentry, inode);
+ d_tmpfile(tmpfile, inode);
} else
d_instantiate(dentry, inode);
@@ -261,7 +261,7 @@ xfs_vn_mknod(
umode_t mode,
dev_t rdev)
{
- return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL);
}
STATIC int
@@ -272,7 +272,7 @@ xfs_vn_create(
umode_t mode,
bool flags)
{
- return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL);
}
STATIC int
@@ -283,7 +283,7 @@ xfs_vn_mkdir(
umode_t mode)
{
return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0,
- false);
+ NULL);
}
STATIC struct dentry *
@@ -558,6 +558,8 @@ xfs_vn_getattr(
struct inode *inode = d_inode(path->dentry);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
trace_xfs_getattr(ip);
@@ -568,8 +570,8 @@ xfs_vn_getattr(
stat->dev = inode->i_sb->s_dev;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = i_uid_into_mnt(mnt_userns, inode);
- stat->gid = i_gid_into_mnt(mnt_userns, inode);
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
@@ -1090,10 +1092,12 @@ STATIC int
xfs_vn_tmpfile(
struct user_namespace *mnt_userns,
struct inode *dir,
- struct dentry *dentry,
+ struct file *file,
umode_t mode)
{
- return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true);
+ int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file);
+
+ return finish_open_simple(file, err);
}
static const struct inode_operations xfs_inode_operations = {
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index cb5fc68c9ea0..e570dcb5df8d 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -13,7 +13,6 @@ extern const struct file_operations xfs_dir_file_operations;
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
-extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *vap);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 36312b00b164..a1c2bcf65d37 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -66,6 +66,8 @@ xfs_bulkstat_one_int(
struct xfs_bulkstat *buf = bc->buf;
xfs_extnum_t nextents;
int error = -EINVAL;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
if (xfs_internal_inum(mp, ino))
goto out_advance;
@@ -81,14 +83,16 @@ xfs_bulkstat_one_int(
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
inode = VFS_I(ip);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* xfs_iget returns the following without needing
* further change.
*/
buf->bs_projectid = ip->i_projid;
buf->bs_ino = ino;
- buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode));
- buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode));
+ buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid));
+ buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid));
buf->bs_size = ip->i_disk_size;
buf->bs_nlink = inode->i_nlink;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 386b0307aed8..f02a0dd522b3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -226,12 +226,12 @@ xlog_ticket_reservation(
if (head == &log->l_write_head) {
ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
return tic->t_unit_res;
- } else {
- if (tic->t_flags & XLOG_TIC_PERM_RESERV)
- return tic->t_unit_res * tic->t_cnt;
- else
- return tic->t_unit_res;
}
+
+ if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+ return tic->t_unit_res * tic->t_cnt;
+
+ return tic->t_unit_res;
}
STATIC bool
@@ -3544,7 +3544,7 @@ xlog_ticket_alloc(
tic->t_curr_res = unit_res;
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
- tic->t_tid = prandom_u32();
+ tic->t_tid = get_random_u32();
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f10c88cee116..e8bb3c2e847e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -300,26 +300,28 @@ xfs_validate_new_dalign(
"alignment check failed: sunit/swidth vs. blocksize(%d)",
mp->m_sb.sb_blocksize);
return -EINVAL;
- } else {
- /*
- * Convert the stripe unit and width to FSBs.
- */
- mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
- if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
- xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. agsize(%d)",
- mp->m_sb.sb_agblocks);
- return -EINVAL;
- } else if (mp->m_dalign) {
- mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
- } else {
- xfs_warn(mp,
- "alignment check failed: sunit(%d) less than bsize(%d)",
- mp->m_dalign, mp->m_sb.sb_blocksize);
- return -EINVAL;
- }
}
+ /*
+ * Convert the stripe unit and width to FSBs.
+ */
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ mp->m_sb.sb_agblocks);
+ return -EINVAL;
+ }
+
+ if (!mp->m_dalign) {
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ }
+
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+
if (!xfs_has_dalign(mp)) {
xfs_warn(mp,
"cannot change alignment: superblock does not support data alignment");
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 5b1f9a24ed59..c4078d0ec108 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -23,17 +23,18 @@
#include <linux/mm.h>
#include <linux/dax.h>
-struct failure_info {
+struct xfs_failure_info {
xfs_agblock_t startblock;
xfs_extlen_t blockcount;
int mf_flags;
+ bool want_shutdown;
};
static pgoff_t
xfs_failure_pgoff(
struct xfs_mount *mp,
const struct xfs_rmap_irec *rec,
- const struct failure_info *notify)
+ const struct xfs_failure_info *notify)
{
loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
@@ -47,7 +48,7 @@ static unsigned long
xfs_failure_pgcnt(
struct xfs_mount *mp,
const struct xfs_rmap_irec *rec,
- const struct failure_info *notify)
+ const struct xfs_failure_info *notify)
{
xfs_agblock_t end_rec;
xfs_agblock_t end_notify;
@@ -71,13 +72,13 @@ xfs_dax_failure_fn(
{
struct xfs_mount *mp = cur->bc_mp;
struct xfs_inode *ip;
- struct failure_info *notify = data;
+ struct xfs_failure_info *notify = data;
int error = 0;
if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
- return -EFSCORRUPTED;
+ notify->want_shutdown = true;
+ return 0;
}
/* Get files that incore, filter out others that are not in use. */
@@ -86,8 +87,10 @@ xfs_dax_failure_fn(
/* Continue the rmap query if the inode isn't incore */
if (error == -ENODATA)
return 0;
- if (error)
- return error;
+ if (error) {
+ notify->want_shutdown = true;
+ return 0;
+ }
error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
xfs_failure_pgoff(mp, rec, notify),
@@ -104,6 +107,7 @@ xfs_dax_notify_ddev_failure(
xfs_daddr_t bblen,
int mf_flags)
{
+ struct xfs_failure_info notify = { .mf_flags = mf_flags };
struct xfs_trans *tp = NULL;
struct xfs_btree_cur *cur = NULL;
struct xfs_buf *agf_bp = NULL;
@@ -120,7 +124,6 @@ xfs_dax_notify_ddev_failure(
for (; agno <= end_agno; agno++) {
struct xfs_rmap_irec ri_low = { };
struct xfs_rmap_irec ri_high;
- struct failure_info notify;
struct xfs_agf *agf;
xfs_agblock_t agend;
struct xfs_perag *pag;
@@ -161,6 +164,11 @@ xfs_dax_notify_ddev_failure(
}
xfs_trans_cancel(tp);
+ if (error || notify.want_shutdown) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ if (!error)
+ error = -EFSCORRUPTED;
+ }
return error;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 251f20ddd368..93bdd25680bc 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -200,7 +200,9 @@ xfs_reflink_trim_around_shared(
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
- } else if (fbno == agbno) {
+ }
+
+ if (fbno == agbno) {
/*
* The start of this extent is shared. Truncate the
* mapping at the end of the shared region so that a
@@ -210,16 +212,16 @@ xfs_reflink_trim_around_shared(
irec->br_blockcount = flen;
*shared = true;
return 0;
- } else {
- /*
- * There's a shared extent midway through this extent.
- * Truncate the mapping at the start of the shared
- * extent so that a subsequent iteration starts at the
- * start of the shared region.
- */
- irec->br_blockcount = fbno - agbno;
- return 0;
}
+
+ /*
+ * There's a shared extent midway through this extent.
+ * Truncate the mapping at the start of the shared
+ * extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = fbno - agbno;
+ return 0;
}
int
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 20e0534a772c..90a77cd3ebad 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -74,7 +74,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
}
- len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
defer_relog);
@@ -125,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
{
int j;
- seq_printf(m, "qm");
+ seq_puts(m, "qm");
for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9ac59814bbb6..f029c6702dda 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -653,7 +653,7 @@ xfs_fs_destroy_inode(
static void
xfs_fs_dirty_inode(
struct inode *inode,
- int flag)
+ int flags)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -661,7 +661,13 @@ xfs_fs_dirty_inode(
if (!(inode->i_sb->s_flags & SB_LAZYTIME))
return;
- if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME))
+
+ /*
+ * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC)
+ * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed
+ * in flags possibly together with I_DIRTY_SYNC.
+ */
+ if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME))
return;
if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp))
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f9057af6e0c8..cb7c81ba7fa3 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1170,7 +1170,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class,
__entry->ino_res_used = qtrx->qt_ino_res_used;
__entry->icount_delta = qtrx->qt_icount_delta;
),
- TP_printk("dev %d:%d dquot id 0x%x type %s flags %s"
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s "
"blk_res %llu bcount_delta %lld delbcnt_delta %lld "
"rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
"ino_res %llu ino_res_used %llu icount_delta %lld",
@@ -1602,7 +1602,7 @@ TRACE_EVENT(xfs_bunmap,
__entry->caller_ip = caller_ip;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx"
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx "
"flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d3a97a028560..16fbf2a1144c 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -602,9 +602,9 @@ xfsaild(
while (1) {
if (tout && tout <= 20)
- set_current_state(TASK_KILLABLE);
+ set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
else
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
/*
* Check kthread_should_stop() after we set the task state to
@@ -653,14 +653,14 @@ xfsaild(
ailp->ail_target == ailp->ail_target_prev &&
list_empty(&ailp->ail_buf_list)) {
spin_unlock(&ailp->ail_lock);
- freezable_schedule();
+ schedule();
tout = 0;
continue;
}
spin_unlock(&ailp->ail_lock);
if (tout)
- freezable_schedule_timeout(msecs_to_jiffies(tout));
+ schedule_timeout(msecs_to_jiffies(tout));
__set_current_state(TASK_RUNNING);