Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_addr.c | 23
-rw-r--r--  fs/Kconfig | 22
-rw-r--r--  fs/Kconfig.binfmt | 6
-rw-r--r--  fs/adfs/inode.c | 10
-rw-r--r--  fs/affs/file.c | 21
-rw-r--r--  fs/affs/symlink.c | 5
-rw-r--r--  fs/afs/dir.c | 7
-rw-r--r--  fs/afs/file.c | 28
-rw-r--r--  fs/afs/inode.c | 14
-rw-r--r--  fs/afs/internal.h | 4
-rw-r--r--  fs/afs/misc.c | 5
-rw-r--r--  fs/afs/rotate.c | 4
-rw-r--r--  fs/afs/rxrpc.c | 8
-rw-r--r--  fs/afs/security.c | 3
-rw-r--r--  fs/afs/write.c | 5
-rw-r--r--  fs/befs/linuxvfs.c | 17
-rw-r--r--  fs/bfs/file.c | 11
-rw-r--r--  fs/binfmt_flat.c | 239
-rw-r--r--  fs/btrfs/acl.c | 39
-rw-r--r--  fs/btrfs/async-thread.c | 122
-rw-r--r--  fs/btrfs/async-thread.h | 7
-rw-r--r--  fs/btrfs/block-group.c | 205
-rw-r--r--  fs/btrfs/block-group.h | 7
-rw-r--r--  fs/btrfs/btrfs_inode.h | 34
-rw-r--r--  fs/btrfs/check-integrity.c | 172
-rw-r--r--  fs/btrfs/check-integrity.h | 6
-rw-r--r--  fs/btrfs/compression.c | 60
-rw-r--r--  fs/btrfs/compression.h | 4
-rw-r--r--  fs/btrfs/ctree.c | 102
-rw-r--r--  fs/btrfs/ctree.h | 165
-rw-r--r--  fs/btrfs/delalloc-space.c | 9
-rw-r--r--  fs/btrfs/delayed-inode.c | 84
-rw-r--r--  fs/btrfs/delayed-ref.c | 4
-rw-r--r--  fs/btrfs/delayed-ref.h | 1
-rw-r--r--  fs/btrfs/dev-replace.c | 52
-rw-r--r--  fs/btrfs/dir-item.c | 31
-rw-r--r--  fs/btrfs/disk-io.c | 344
-rw-r--r--  fs/btrfs/disk-io.h | 10
-rw-r--r--  fs/btrfs/extent-tree.c | 69
-rw-r--r--  fs/btrfs/extent_io.c | 634
-rw-r--r--  fs/btrfs/extent_io.h | 47
-rw-r--r--  fs/btrfs/file.c | 295
-rw-r--r--  fs/btrfs/free-space-cache.c | 11
-rw-r--r--  fs/btrfs/free-space-tree.c | 2
-rw-r--r--  fs/btrfs/inode.c | 1893
-rw-r--r--  fs/btrfs/ioctl.c | 289
-rw-r--r--  fs/btrfs/props.c | 99
-rw-r--r--  fs/btrfs/props.h | 8
-rw-r--r--  fs/btrfs/qgroup.c | 7
-rw-r--r--  fs/btrfs/qgroup.h | 12
-rw-r--r--  fs/btrfs/raid56.c | 809
-rw-r--r--  fs/btrfs/raid56.h | 9
-rw-r--r--  fs/btrfs/reflink.c | 23
-rw-r--r--  fs/btrfs/relocation.c | 32
-rw-r--r--  fs/btrfs/root-tree.c | 3
-rw-r--r--  fs/btrfs/scrub.c | 1889
-rw-r--r--  fs/btrfs/send.c | 402
-rw-r--r--  fs/btrfs/space-info.c | 11
-rw-r--r--  fs/btrfs/space-info.h | 8
-rw-r--r--  fs/btrfs/subpage.c | 55
-rw-r--r--  fs/btrfs/subpage.h | 2
-rw-r--r--  fs/btrfs/super.c | 9
-rw-r--r--  fs/btrfs/sysfs.c | 46
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 24
-rw-r--r--  fs/btrfs/transaction.c | 116
-rw-r--r--  fs/btrfs/tree-checker.c | 55
-rw-r--r--  fs/btrfs/tree-checker.h | 1
-rw-r--r--  fs/btrfs/tree-log.c | 64
-rw-r--r--  fs/btrfs/volumes.c | 142
-rw-r--r--  fs/btrfs/volumes.h | 46
-rw-r--r--  fs/btrfs/xattr.c | 51
-rw-r--r--  fs/btrfs/zoned.c | 252
-rw-r--r--  fs/btrfs/zoned.h | 23
-rw-r--r--  fs/btrfs/zstd.c | 14
-rw-r--r--  fs/buffer.c | 214
-rw-r--r--  fs/cachefiles/Kconfig | 12
-rw-r--r--  fs/cachefiles/Makefile | 1
-rw-r--r--  fs/cachefiles/daemon.c | 117
-rw-r--r--  fs/cachefiles/interface.c | 2
-rw-r--r--  fs/cachefiles/internal.h | 78
-rw-r--r--  fs/cachefiles/io.c | 76
-rw-r--r--  fs/cachefiles/namei.c | 16
-rw-r--r--  fs/cachefiles/ondemand.c | 503
-rw-r--r--  fs/ceph/addr.c | 43
-rw-r--r--  fs/ceph/caps.c | 7
-rw-r--r--  fs/ceph/file.c | 16
-rw-r--r--  fs/ceph/mds_client.c | 6
-rw-r--r--  fs/cifs/file.c | 38
-rw-r--r--  fs/cifs/smbdirect.c | 2
-rw-r--r--  fs/coda/symlink.c | 7
-rw-r--r--  fs/cramfs/README | 8
-rw-r--r--  fs/cramfs/inode.c | 7
-rw-r--r--  fs/crypto/crypto.c | 10
-rw-r--r--  fs/crypto/fname.c | 11
-rw-r--r--  fs/crypto/fscrypt_private.h | 10
-rw-r--r--  fs/crypto/inline_crypt.c | 33
-rw-r--r--  fs/crypto/keyring.c | 64
-rw-r--r--  fs/crypto/keysetup.c | 22
-rw-r--r--  fs/crypto/policy.c | 132
-rw-r--r--  fs/dax.c | 98
-rw-r--r--  fs/direct-io.c | 32
-rw-r--r--  fs/dlm/dir.c | 2
-rw-r--r--  fs/dlm/dlm_internal.h | 66
-rw-r--r--  fs/dlm/lock.c | 654
-rw-r--r--  fs/dlm/lockspace.c | 12
-rw-r--r--  fs/dlm/lockspace.h | 1
-rw-r--r--  fs/dlm/lowcomms.c | 12
-rw-r--r--  fs/dlm/member.c | 11
-rw-r--r--  fs/dlm/midcomms.c | 61
-rw-r--r--  fs/dlm/plock.c | 178
-rw-r--r--  fs/dlm/rcom.c | 120
-rw-r--r--  fs/dlm/recover.c | 49
-rw-r--r--  fs/dlm/requestqueue.c | 20
-rw-r--r--  fs/dlm/user.c | 16
-rw-r--r--  fs/dlm/util.c | 92
-rw-r--r--  fs/dlm/util.h | 8
-rw-r--r--  fs/ecryptfs/mmap.c | 15
-rw-r--r--  fs/efs/inode.c | 8
-rw-r--r--  fs/efs/symlink.c | 5
-rw-r--r--  fs/erofs/Kconfig | 10
-rw-r--r--  fs/erofs/Makefile | 1
-rw-r--r--  fs/erofs/data.c | 34
-rw-r--r--  fs/erofs/decompressor.c | 7
-rw-r--r--  fs/erofs/erofs_fs.h | 50
-rw-r--r--  fs/erofs/fscache.c | 519
-rw-r--r--  fs/erofs/inode.c | 11
-rw-r--r--  fs/erofs/internal.h | 76
-rw-r--r--  fs/erofs/namei.c | 5
-rw-r--r--  fs/erofs/super.c | 237
-rw-r--r--  fs/erofs/sysfs.c | 4
-rw-r--r--  fs/erofs/zdata.c | 7
-rw-r--r--  fs/exec.c | 6
-rw-r--r--  fs/exfat/balloc.c | 8
-rw-r--r--  fs/exfat/exfat_fs.h | 7
-rw-r--r--  fs/exfat/fatent.c | 47
-rw-r--r--  fs/exfat/file.c | 5
-rw-r--r--  fs/exfat/inode.c | 10
-rw-r--r--  fs/exfat/misc.c | 10
-rw-r--r--  fs/exfat/namei.c | 27
-rw-r--r--  fs/exfat/super.c | 19
-rw-r--r--  fs/ext2/inode.c | 20
-rw-r--r--  fs/ext4/Makefile | 1
-rw-r--r--  fs/ext4/crypto.c | 246
-rw-r--r--  fs/ext4/dir.c | 6
-rw-r--r--  fs/ext4/ext4.h | 86
-rw-r--r--  fs/ext4/extents.c | 20
-rw-r--r--  fs/ext4/fast_commit.c | 13
-rw-r--r--  fs/ext4/file.c | 4
-rw-r--r--  fs/ext4/inline.c | 59
-rw-r--r--  fs/ext4/inode.c | 85
-rw-r--r--  fs/ext4/ioctl.c | 69
-rw-r--r--  fs/ext4/mballoc.c | 35
-rw-r--r--  fs/ext4/mmp.c | 2
-rw-r--r--  fs/ext4/move_extent.c | 17
-rw-r--r--  fs/ext4/namei.c | 214
-rw-r--r--  fs/ext4/readpage.c | 4
-rw-r--r--  fs/ext4/super.c | 227
-rw-r--r--  fs/ext4/symlink.c | 51
-rw-r--r--  fs/ext4/verity.c | 9
-rw-r--r--  fs/f2fs/checkpoint.c | 2
-rw-r--r--  fs/f2fs/compress.c | 2
-rw-r--r--  fs/f2fs/data.c | 42
-rw-r--r--  fs/f2fs/f2fs.h | 14
-rw-r--r--  fs/f2fs/file.c | 23
-rw-r--r--  fs/f2fs/node.c | 2
-rw-r--r--  fs/f2fs/segment.c | 8
-rw-r--r--  fs/f2fs/super.c | 2
-rw-r--r--  fs/f2fs/verity.c | 9
-rw-r--r--  fs/fat/file.c | 5
-rw-r--r--  fs/fat/inode.c | 20
-rw-r--r--  fs/fcntl.c | 9
-rw-r--r--  fs/freevxfs/vxfs_immed.c | 15
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 17
-rw-r--r--  fs/fs-writeback.c | 19
-rw-r--r--  fs/fuse/dir.c | 10
-rw-r--r--  fs/fuse/file.c | 12
-rw-r--r--  fs/gfs2/aops.c | 81
-rw-r--r--  fs/gfs2/bmap.c | 11
-rw-r--r--  fs/gfs2/file.c | 143
-rw-r--r--  fs/gfs2/glock.c | 35
-rw-r--r--  fs/gfs2/glock.h | 12
-rw-r--r--  fs/gfs2/inode.h | 2
-rw-r--r--  fs/gfs2/main.c | 10
-rw-r--r--  fs/gfs2/meta_io.c | 4
-rw-r--r--  fs/gfs2/meta_io.h | 8
-rw-r--r--  fs/gfs2/quota.c | 62
-rw-r--r--  fs/gfs2/recovery.c | 22
-rw-r--r--  fs/gfs2/rgrp.c | 11
-rw-r--r--  fs/hfs/extent.c | 6
-rw-r--r--  fs/hfs/hfs_fs.h | 2
-rw-r--r--  fs/hfs/inode.c | 38
-rw-r--r--  fs/hfsplus/extents.c | 8
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 2
-rw-r--r--  fs/hfsplus/inode.c | 38
-rw-r--r--  fs/hostfs/hostfs_kern.c | 9
-rw-r--r--  fs/hpfs/file.c | 10
-rw-r--r--  fs/hpfs/namei.c | 5
-rw-r--r--  fs/hugetlbfs/inode.c | 21
-rw-r--r--  fs/internal.h | 29
-rw-r--r--  fs/io-wq.c | 4
-rw-r--r--  fs/io-wq.h | 1
-rw-r--r--  fs/io_uring.c | 3573
-rw-r--r--  fs/iomap/buffered-io.c | 44
-rw-r--r--  fs/iomap/direct-io.c | 35
-rw-r--r--  fs/iomap/trace.h | 2
-rw-r--r--  fs/isofs/compress.c | 5
-rw-r--r--  fs/isofs/inode.c | 6
-rw-r--r--  fs/isofs/rock.c | 7
-rw-r--r--  fs/jbd2/commit.c | 14
-rw-r--r--  fs/jbd2/journal.c | 9
-rw-r--r--  fs/jbd2/transaction.c | 14
-rw-r--r--  fs/jffs2/file.c | 23
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jffs2/gc.c | 2
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jfs/inode.c | 11
-rw-r--r--  fs/jfs/ioctl.c | 5
-rw-r--r--  fs/jfs/jfs_metapage.c | 21
-rw-r--r--  fs/jfs/super.c | 8
-rw-r--r--  fs/kernfs/dir.c | 7
-rw-r--r--  fs/libfs.c | 18
-rw-r--r--  fs/locks.c | 61
-rw-r--r--  fs/minix/inode.c | 11
-rw-r--r--  fs/mpage.c | 20
-rw-r--r--  fs/namei.c | 28
-rw-r--r--  fs/namespace.c | 5
-rw-r--r--  fs/netfs/buffered_read.c | 25
-rw-r--r--  fs/nfs/dir.c | 9
-rw-r--r--  fs/nfs/direct.c | 23
-rw-r--r--  fs/nfs/file.c | 69
-rw-r--r--  fs/nfs/fs_context.c | 2
-rw-r--r--  fs/nfs/fscache.h | 14
-rw-r--r--  fs/nfs/nfs4proc.c | 12
-rw-r--r--  fs/nfs/read.c | 3
-rw-r--r--  fs/nfs/symlink.c | 16
-rw-r--r--  fs/nfsd/filecache.c | 68
-rw-r--r--  fs/nfsd/filecache.h | 2
-rw-r--r--  fs/nfsd/nfs3proc.c | 141
-rw-r--r--  fs/nfsd/nfs4proc.c | 264
-rw-r--r--  fs/nfsd/nfs4state.c | 353
-rw-r--r--  fs/nfsd/nfs4xdr.c | 2
-rw-r--r--  fs/nfsd/nfscache.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 20
-rw-r--r--  fs/nfsd/nfsd.h | 5
-rw-r--r--  fs/nfsd/state.h | 31
-rw-r--r--  fs/nfsd/trace.h | 34
-rw-r--r--  fs/nfsd/vfs.c | 255
-rw-r--r--  fs/nfsd/vfs.h | 14
-rw-r--r--  fs/nfsd/xdr4.h | 1
-rw-r--r--  fs/nilfs2/inode.c | 27
-rw-r--r--  fs/nilfs2/ioctl.c | 6
-rw-r--r--  fs/nilfs2/recovery.c | 2
-rw-r--r--  fs/nilfs2/sufile.c | 4
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/notify/dnotify/dnotify.c | 13
-rw-r--r--  fs/notify/fanotify/fanotify.c | 24
-rw-r--r--  fs/notify/fanotify/fanotify.h | 12
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 117
-rw-r--r--  fs/notify/fdinfo.c | 21
-rw-r--r--  fs/notify/fsnotify.c | 89
-rw-r--r--  fs/notify/group.c | 32
-rw-r--r--  fs/notify/inotify/inotify.h | 19
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 47
-rw-r--r--  fs/notify/mark.c | 112
-rw-r--r--  fs/ntfs/aops.c | 40
-rw-r--r--  fs/ntfs/aops.h | 6
-rw-r--r--  fs/ntfs/attrib.c | 2
-rw-r--r--  fs/ntfs/compress.c | 4
-rw-r--r--  fs/ntfs/file.c | 4
-rw-r--r--  fs/ntfs/inode.c | 4
-rw-r--r--  fs/ntfs/mft.h | 2
-rw-r--r--  fs/ntfs3/file.c | 13
-rw-r--r--  fs/ntfs3/inode.c | 27
-rw-r--r--  fs/ntfs3/ntfs_fs.h | 5
-rw-r--r--  fs/ntfs3/super.c | 10
-rw-r--r--  fs/ocfs2/alloc.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 23
-rw-r--r--  fs/ocfs2/file.c | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 5
-rw-r--r--  fs/ocfs2/refcounttree.c | 6
-rw-r--r--  fs/ocfs2/symlink.c | 5
-rw-r--r--  fs/omfs/file.c | 11
-rw-r--r--  fs/open.c | 51
-rw-r--r--  fs/orangefs/inode.c | 52
-rw-r--r--  fs/overlayfs/file.c | 13
-rw-r--r--  fs/pipe.c | 31
-rw-r--r--  fs/proc/base.c | 22
-rw-r--r--  fs/proc/cpuinfo.c | 6
-rw-r--r--  fs/proc/fd.c | 23
-rw-r--r--  fs/proc/meminfo.c | 7
-rw-r--r--  fs/proc/proc_sysctl.c | 93
-rw-r--r--  fs/proc/task_mmu.c | 9
-rw-r--r--  fs/qnx4/inode.c | 7
-rw-r--r--  fs/qnx6/inode.c | 6
-rw-r--r--  fs/reiserfs/file.c | 2
-rw-r--r--  fs/reiserfs/inode.c | 36
-rw-r--r--  fs/reiserfs/journal.c | 14
-rw-r--r--  fs/romfs/super.c | 9
-rw-r--r--  fs/seq_file.c | 32
-rw-r--r--  fs/squashfs/block.c | 20
-rw-r--r--  fs/squashfs/file.c | 5
-rw-r--r--  fs/squashfs/super.c | 2
-rw-r--r--  fs/squashfs/symlink.c | 5
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/sysv/itree.c | 10
-rw-r--r--  fs/ubifs/file.c | 41
-rw-r--r--  fs/ubifs/super.c | 2
-rw-r--r--  fs/ubifs/ubifs.h | 2
-rw-r--r--  fs/udf/file.c | 14
-rw-r--r--  fs/udf/inode.c | 10
-rw-r--r--  fs/udf/namei.c | 8
-rw-r--r--  fs/udf/symlink.c | 5
-rw-r--r--  fs/ufs/inode.c | 13
-rw-r--r--  fs/userfaultfd.c | 32
-rw-r--r--  fs/vboxsf/file.c | 5
-rw-r--r--  fs/verity/Kconfig | 1
-rw-r--r--  fs/verity/enable.c | 33
-rw-r--r--  fs/verity/fsverity_private.h | 13
-rw-r--r--  fs/verity/measure.c | 43
-rw-r--r--  fs/verity/open.c | 12
-rw-r--r--  fs/verity/read_metadata.c | 5
-rw-r--r--  fs/xattr.c | 143
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 1642
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h | 198
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 64
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 37
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 167
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 58
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 150
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 26
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 25
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 9
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 54
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 189
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 41
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 118
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 51
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 76
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 79
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_log_rlimit.c | 75
-rw-r--r--  fs/xfs/libxfs/xfs_quota_defs.h | 50
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 13
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 161
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 80
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 24
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 225
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.h | 16
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 11
-rw-r--r--  fs/xfs/scrub/bmap.c | 26
-rw-r--r--  fs/xfs/scrub/common.c | 2
-rw-r--r--  fs/xfs/scrub/inode.c | 20
-rw-r--r--  fs/xfs/scrub/rtbitmap.c | 9
-rw-r--r--  fs/xfs/xfs_acl.c | 4
-rw-r--r--  fs/xfs/xfs_acl.h | 8
-rw-r--r--  fs/xfs/xfs_aops.c | 14
-rw-r--r--  fs/xfs/xfs_attr_item.c | 824
-rw-r--r--  fs/xfs/xfs_attr_item.h | 46
-rw-r--r--  fs/xfs/xfs_attr_list.c | 1
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 27
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 27
-rw-r--r--  fs/xfs/xfs_buf_item.h | 24
-rw-r--r--  fs/xfs/xfs_discard.c | 8
-rw-r--r--  fs/xfs/xfs_dquot.c | 18
-rw-r--r--  fs/xfs/xfs_dquot.h | 8
-rw-r--r--  fs/xfs/xfs_error.c | 9
-rw-r--r--  fs/xfs/xfs_error.h | 20
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 23
-rw-r--r--  fs/xfs/xfs_file.c | 30
-rw-r--r--  fs/xfs/xfs_filestream.c | 7
-rw-r--r--  fs/xfs/xfs_fsmap.c | 6
-rw-r--r--  fs/xfs/xfs_fsops.c | 7
-rw-r--r--  fs/xfs/xfs_globals.c | 1
-rw-r--r--  fs/xfs/xfs_icache.c | 9
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 1
-rw-r--r--  fs/xfs/xfs_inode.c | 80
-rw-r--r--  fs/xfs/xfs_inode.h | 29
-rw-r--r--  fs/xfs/xfs_inode_item.c | 48
-rw-r--r--  fs/xfs/xfs_inode_item_recover.c | 145
-rw-r--r--  fs/xfs/xfs_ioctl.c | 7
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 33
-rw-r--r--  fs/xfs/xfs_iops.c | 4
-rw-r--r--  fs/xfs/xfs_itable.c | 15
-rw-r--r--  fs/xfs/xfs_itable.h | 5
-rw-r--r--  fs/xfs/xfs_iwalk.h | 2
-rw-r--r--  fs/xfs/xfs_log.c | 807
-rw-r--r--  fs/xfs/xfs_log.h | 90
-rw-r--r--  fs/xfs/xfs_log_cil.c | 393
-rw-r--r--  fs/xfs/xfs_log_priv.h | 89
-rw-r--r--  fs/xfs/xfs_log_recover.c | 2
-rw-r--r--  fs/xfs/xfs_message.c | 58
-rw-r--r--  fs/xfs/xfs_message.h | 55
-rw-r--r--  fs/xfs/xfs_mount.c | 91
-rw-r--r--  fs/xfs/xfs_mount.h | 32
-rw-r--r--  fs/xfs/xfs_ondisk.h | 2
-rw-r--r--  fs/xfs/xfs_qm.c | 9
-rw-r--r--  fs/xfs/xfs_qm.h | 5
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 26
-rw-r--r--  fs/xfs/xfs_quotaops.c | 8
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 25
-rw-r--r--  fs/xfs/xfs_reflink.c | 100
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 25
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 41
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 9
-rw-r--r--  fs/xfs/xfs_super.c | 30
-rw-r--r--  fs/xfs/xfs_symlink.c | 5
-rw-r--r--  fs/xfs/xfs_sysctl.h | 1
-rw-r--r--  fs/xfs/xfs_sysfs.c | 24
-rw-r--r--  fs/xfs/xfs_trace.h | 100
-rw-r--r--  fs/xfs/xfs_trans.c | 52
-rw-r--r--  fs/xfs/xfs_trans.h | 38
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_xattr.c | 2
-rw-r--r--  fs/zonefs/Makefile | 2
-rw-r--r--  fs/zonefs/super.c | 197
-rw-r--r--  fs/zonefs/sysfs.c | 139
-rw-r--r--  fs/zonefs/zonefs.h | 18
433 files changed, 18081 insertions, 11810 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 501128188343..8ce82ff1e40a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -100,29 +100,28 @@ const struct netfs_request_ops v9fs_req_ops = {
};
/**
- * v9fs_release_page - release the private state associated with a page
- * @page: The page to be released
+ * v9fs_release_folio - release the private state associated with a folio
+ * @folio: The folio to be released
* @gfp: The caller's allocation restrictions
*
- * Returns 1 if the page can be released, false otherwise.
+ * Returns true if the page can be released, false otherwise.
*/
-static int v9fs_release_page(struct page *page, gfp_t gfp)
+static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct folio *folio = page_folio(page);
struct inode *inode = folio_inode(folio);
if (folio_test_private(folio))
- return 0;
+ return false;
#ifdef CONFIG_9P_FSCACHE
if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
- return 0;
+ return false;
folio_wait_fscache(folio);
}
#endif
fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode)));
- return 1;
+ return true;
}
static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
@@ -260,7 +259,7 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int flags,
+ loff_t pos, unsigned int len,
struct page **subpagep, void **fsdata)
{
int retval;
@@ -275,7 +274,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata);
+ retval = netfs_write_begin(filp, mapping, pos, len, &folio, fsdata);
if (retval < 0)
return retval;
@@ -336,13 +335,13 @@ static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
#endif
const struct address_space_operations v9fs_addr_operations = {
- .readpage = netfs_readpage,
+ .read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.dirty_folio = v9fs_dirty_folio,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
.write_end = v9fs_write_end,
- .releasepage = v9fs_release_page,
+ .release_folio = v9fs_release_folio,
.invalidate_folio = v9fs_invalidate_folio,
.launder_folio = v9fs_launder_folio,
.direct_IO = v9fs_direct_IO,
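
The hunks above show the shape of the page-to-folio aops conversion repeated throughout this series: ->readpage/->releasepage took a struct page and returned int, while ->read_folio/->release_folio take the folio directly, with release_folio returning bool, so the page_folio() lookup disappears. A minimal sketch of a converted pair, assuming a hypothetical "foofs" with no fscache involvement (the hook names and folio helpers are real; everything else here is illustrative, not taken from this patch):

/* Minimal sketch of the ->releasepage to ->release_folio conversion
 * for a hypothetical foofs; not from this patch. */
static bool foofs_release_folio(struct folio *folio, gfp_t gfp)
{
	/* Private state still attached: the folio cannot be freed yet. */
	if (folio_test_private(folio))
		return false;
	return true;
}

static const struct address_space_operations foofs_aops = {
	.read_folio	= netfs_read_folio,	/* was .readpage = netfs_readpage */
	.release_folio	= foofs_release_folio,	/* was .releasepage */
};
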
diff --git a/fs/Kconfig b/fs/Kconfig
index 30b751c7f11a..5976eb33535f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -245,19 +245,27 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
-config HUGETLB_PAGE_FREE_VMEMMAP
+#
+# Select this config option from the architecture Kconfig, if it is preferred
+# to enable the feature of minimizing overhead of struct page associated with
+# each HugeTLB page.
+#
+config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ bool
+
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on X86_64
+ depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
-config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
- bool "Default freeing vmemmap pages of HugeTLB to on"
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
+ bool "Default optimizing vmemmap pages of HugeTLB to on"
default n
- depends on HUGETLB_PAGE_FREE_VMEMMAP
+ depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
help
- When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
+ When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
pages associated with each HugeTLB page is default off. Say Y here
- to enable freeing vmemmap pages of HugeTLB by default. It can then
+ to enable optimizing vmemmap pages of HugeTLB by default. It can then
be disabled on the command line via hugetlb_free_vmemmap=off.
config MEMFD_CREATE
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 21c6332fa785..32dff7ba3dda 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -142,12 +142,6 @@ config BINFMT_ZFLAT
help
Support FLAT format compressed binaries
-config BINFMT_SHARED_FLAT
- bool "Enable shared FLAT support"
- depends on BINFMT_FLAT
- help
- Support FLAT shared libraries
-
config HAVE_AOUT
def_bool n
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 561bc748c04a..ee22278b0cfc 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -38,9 +38,9 @@ static int adfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, adfs_get_block, wbc);
}
-static int adfs_readpage(struct file *file, struct page *page)
+static int adfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, adfs_get_block);
+ return block_read_full_folio(folio, adfs_get_block);
}
static void adfs_write_failed(struct address_space *mapping, loff_t to)
@@ -52,13 +52,13 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
}
static int adfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -75,7 +75,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations adfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = adfs_readpage,
+ .read_folio = adfs_read_folio,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
.write_end = generic_write_end,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index b3f81d84ff4c..cd00a4c68a12 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -375,9 +375,9 @@ static int affs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, affs_get_block, wbc);
}
-static int affs_readpage(struct file *file, struct page *page)
+static int affs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, affs_get_block);
+ return block_read_full_folio(folio, affs_get_block);
}
static void affs_write_failed(struct address_space *mapping, loff_t to)
@@ -414,13 +414,13 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int affs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -455,7 +455,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations affs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = affs_readpage,
+ .read_folio = affs_read_folio,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
.write_end = affs_write_end,
@@ -629,8 +629,9 @@ out:
}
static int
-affs_readpage_ofs(struct file *file, struct page *page)
+affs_read_folio_ofs(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
u32 to;
int err;
@@ -650,7 +651,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
}
static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -670,7 +671,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -837,7 +838,7 @@ err_bh:
const struct address_space_operations affs_aops_ofs = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = affs_readpage_ofs,
+ .read_folio = affs_read_folio_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
.write_end = affs_write_end_ofs
@@ -887,7 +888,7 @@ affs_truncate(struct inode *inode)
loff_t isize = inode->i_size;
int res;
- res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata);
+ res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata);
if (!res)
res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
else
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index a7531b26e8f0..31d6446dc166 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -11,8 +11,9 @@
#include "affs.h"
-static int affs_symlink_readpage(struct file *file, struct page *page)
+static int affs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
char *link = page_address(page);
@@ -67,7 +68,7 @@ fail:
}
const struct address_space_operations affs_symlink_aops = {
- .readpage = affs_symlink_readpage,
+ .read_folio = affs_symlink_read_folio,
};
const struct inode_operations affs_symlink_inode_operations = {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 932e61e28e5d..94aa7356248e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -41,7 +41,7 @@ static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags);
-static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
+static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags);
static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
size_t length);
@@ -75,7 +75,7 @@ const struct inode_operations afs_dir_inode_operations = {
const struct address_space_operations afs_dir_aops = {
.dirty_folio = afs_dir_dirty_folio,
- .releasepage = afs_dir_releasepage,
+ .release_folio = afs_dir_release_folio,
.invalidate_folio = afs_dir_invalidate_folio,
};
@@ -2002,9 +2002,8 @@ error:
* Release a directory folio and clean up its private state if it's not busy
* - return true if the folio can now be released, false if not
*/
-static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
+static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 26292a110a8f..a8e8832179e4 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,10 +19,10 @@
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
-static int afs_symlink_readpage(struct file *file, struct page *page);
+static int afs_symlink_read_folio(struct file *file, struct folio *folio);
static void afs_invalidate_folio(struct folio *folio, size_t offset,
size_t length);
-static int afs_releasepage(struct page *page, gfp_t gfp_flags);
+static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static void afs_vm_open(struct vm_area_struct *area);
@@ -50,11 +50,11 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
- .readpage = netfs_readpage,
+ .read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.dirty_folio = afs_dirty_folio,
.launder_folio = afs_launder_folio,
- .releasepage = afs_releasepage,
+ .release_folio = afs_release_folio,
.invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
@@ -63,8 +63,8 @@ const struct address_space_operations afs_file_aops = {
};
const struct address_space_operations afs_symlink_aops = {
- .readpage = afs_symlink_readpage,
- .releasepage = afs_releasepage,
+ .read_folio = afs_symlink_read_folio,
+ .release_folio = afs_release_folio,
.invalidate_folio = afs_invalidate_folio,
};
@@ -332,11 +332,10 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
afs_put_read(fsreq);
}
-static int afs_symlink_readpage(struct file *file, struct page *page)
+static int afs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);
struct afs_read *fsreq;
- struct folio *folio = page_folio(page);
int ret;
fsreq = afs_alloc_read(GFP_NOFS);
@@ -347,13 +346,13 @@ static int afs_symlink_readpage(struct file *file, struct page *page)
fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages,
+ iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
if (ret == 0)
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return ret;
}
@@ -482,16 +481,15 @@ static void afs_invalidate_folio(struct folio *folio, size_t offset,
* release a page and clean up its private state if it's not busy
* - return true if the page can now be released, false if not
*/
-static int afs_releasepage(struct page *page, gfp_t gfp)
+static bool afs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct folio *folio = page_folio(page);
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
gfp);
- /* deny if page is being written to the cache and the caller hasn't
+ /* deny if folio is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
if (folio_test_fscache(folio)) {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 2fe402483ad5..30b066299d39 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -740,10 +740,22 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
- int seq = 0;
+ struct key *key;
+ int ret, seq = 0;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
+ if (!(query_flags & AT_STATX_DONT_SYNC) &&
+ !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ ret = afs_validate(vnode, key);
+ key_put(key);
+ if (ret < 0)
+ return ret;
+ }
+
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(&init_user_ns, inode, stat);
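
The added hunk changes stat() semantics for kAFS: when the vnode's callback promise has lapsed, afs_getattr() now revalidates against the server first, unless the caller passes AT_STATX_DONT_SYNC. From userspace the opt-out looks like the following illustrative caller (not part of this patch; statx() and AT_STATX_DONT_SYNC are the standard glibc/uapi interfaces):

/* Illustrative caller: AT_STATX_DONT_SYNC requests whatever attributes
 * the client has cached, skipping the server round trip that
 * afs_getattr() now performs by default. Not part of this patch. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <path>\n", argv[0]);
		return 1;
	}
	if (statx(AT_FDCWD, argv[1], AT_STATX_DONT_SYNC,
		  STATX_BASIC_STATS, &stx) == -1) {
		perror("statx");
		return 1;
	}
	printf("cached size: %llu\n", (unsigned long long)stx.stx_size);
	return 0;
}
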
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7b7ef945dc78..a30995901266 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -311,7 +311,7 @@ struct afs_net {
atomic_t n_lookup; /* Number of lookups done */
atomic_t n_reval; /* Number of dentries needing revalidation */
atomic_t n_inval; /* Number of invalidations by the server */
- atomic_t n_relpg; /* Number of invalidations by releasepage */
+ atomic_t n_relpg; /* Number of invalidations by release_folio */
atomic_t n_read_dir; /* Number of directory pages read */
atomic_t n_dir_cr; /* Number of directory entry creation edits */
atomic_t n_dir_rm; /* Number of directory entry removal edits */
@@ -1535,7 +1535,7 @@ bool afs_dirty_folio(struct address_space *, struct folio *);
#define afs_dirty_folio filemap_dirty_folio
#endif
extern int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata);
extern int afs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 1d1a8debe472..933e67fcdab1 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -163,8 +163,11 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
return;
case -ECONNABORTED:
+ error = afs_abort_to_error(abort_code);
+ fallthrough;
+ case -ENETRESET: /* Responded, but we seem to have changed address */
e->responded = true;
- e->error = afs_abort_to_error(abort_code);
+ e->error = error;
return;
}
}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 79e1a5f6701b..a840c3588ebb 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -292,6 +292,10 @@ bool afs_select_fileserver(struct afs_operation *op)
op->error = error;
goto iterate_address;
+ case -ENETRESET:
+ pr_warn("kAFS: Peer reset %s (op=%x)\n",
+ op->type ? op->type->name : "???", op->debug_id);
+ fallthrough;
case -ECONNRESET:
_debug("call reset");
op->error = error;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 23a1a92d64bb..a5434f3e57c6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -537,6 +537,8 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
+ case -ENOMEM:
+ case -EFAULT:
abort_code = RXGEN_CC_UNMARSHAL;
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
@@ -544,7 +546,7 @@ static void afs_deliver_to_call(struct afs_call *call)
abort_code, ret, "KUM");
goto local_abort;
default:
- abort_code = RX_USER_ABORT;
+ abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KER");
goto local_abort;
@@ -836,7 +838,7 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
fallthrough;
default:
_leave(" [error]");
@@ -878,7 +880,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
}
_leave(" [error]");
}
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3c7a8fc4f93f..7c6a63a30394 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -219,8 +219,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
* yet.
*/
size++;
- new = kzalloc(sizeof(struct afs_permits) +
- sizeof(struct afs_permit) * size, GFP_NOFS);
+ new = kzalloc(struct_size(new, permits, size), GFP_NOFS);
if (!new)
goto out_put;
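
The one-hunk change above swaps an open-coded flexible-array allocation size for struct_size(new, permits, size) from <linux/overflow.h>, which saturates on arithmetic overflow so an oversized count yields a failed allocation rather than an undersized buffer. A standalone sketch of the computation, with simplified stand-in fields (the real afs_permit holds a key pointer, not a u64):

/* Standalone sketch of what struct_size(new, permits, size) computes;
 * the field layout here is a simplified stand-in, not fs/afs's real one. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct afs_permit {
	uint64_t key;
	unsigned int access;
};

struct afs_permits {
	unsigned int nr_permits;
	struct afs_permit permits[];	/* flexible array member */
};

/* Saturate to SIZE_MAX on overflow so the allocator fails cleanly
 * instead of returning an undersized buffer. */
static size_t permits_size(size_t n)
{
	if (n > (SIZE_MAX - sizeof(struct afs_permits)) /
		sizeof(struct afs_permit))
		return SIZE_MAX;
	return sizeof(struct afs_permits) + n * sizeof(struct afs_permit);
}

int main(void)
{
	size_t n = 4;
	struct afs_permits *new = calloc(1, permits_size(n));

	if (!new)
		return 1;
	new->nr_permits = n;
	printf("%zu bytes for %zu permits\n", permits_size(n), n);
	free(new);
	return 0;
}
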
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 4763132ca57e..2236b2165e37 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -42,7 +42,7 @@ static void afs_folio_start_fscache(bool caching, struct folio *folio)
* prepare to perform part of a write to a page
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **_page, void **fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
@@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata);
+ ret = netfs_write_begin(file, mapping, pos, len, &folio, fsdata);
if (ret < 0)
return ret;
@@ -636,6 +636,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
+ case -ENETRESET:
afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, ret);
break;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b4b3567ac655..be383fa46b12 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -40,7 +40,7 @@ MODULE_LICENSE("GPL");
static int befs_readdir(struct file *, struct dir_context *);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
-static int befs_readpage(struct file *file, struct page *page);
+static int befs_read_folio(struct file *file, struct folio *folio);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
static struct dentry *befs_lookup(struct inode *, struct dentry *,
unsigned int);
@@ -48,7 +48,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_free_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static int befs_symlink_readpage(struct file *, struct page *);
+static int befs_symlink_read_folio(struct file *, struct folio *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -87,12 +87,12 @@ static const struct inode_operations befs_dir_inode_operations = {
};
static const struct address_space_operations befs_aops = {
- .readpage = befs_readpage,
+ .read_folio = befs_read_folio,
.bmap = befs_bmap,
};
static const struct address_space_operations befs_symlink_aops = {
- .readpage = befs_symlink_readpage,
+ .read_folio = befs_symlink_read_folio,
};
static const struct export_operations befs_export_operations = {
@@ -102,16 +102,16 @@ static const struct export_operations befs_export_operations = {
};
/*
- * Called by generic_file_read() to read a page of data
+ * Called by generic_file_read() to read a folio of data
*
* In turn, simply calls a generic block read function and
* passes it the address of befs_get_block, for mapping file
* positions to disk blocks.
*/
static int
-befs_readpage(struct file *file, struct page *page)
+befs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, befs_get_block);
+ return block_read_full_folio(folio, befs_get_block);
}
static sector_t
@@ -468,8 +468,9 @@ befs_destroy_inodecache(void)
* The data stream become link name. Unless the LONG_SYMLINK
* flag is set.
*/
-static int befs_symlink_readpage(struct file *unused, struct page *page)
+static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct befs_inode_info *befs_ino = BEFS_I(inode);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 03139344568f..57ae5ee6deec 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -155,9 +155,9 @@ static int bfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, bfs_get_block, wbc);
}
-static int bfs_readpage(struct file *file, struct page *page)
+static int bfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, bfs_get_block);
+ return block_read_full_folio(folio, bfs_get_block);
}
static void bfs_write_failed(struct address_space *mapping, loff_t to)
@@ -169,13 +169,12 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
}
static int bfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- bfs_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block);
if (unlikely(ret))
bfs_write_failed(mapping, pos + len);
@@ -190,7 +189,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations bfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = bfs_readpage,
+ .read_folio = bfs_read_folio,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
.write_end = generic_write_end,
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 626898150011..c26545d71d39 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -37,7 +37,6 @@
#include <linux/flat.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
-#include <linux/coredump.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
@@ -69,11 +68,7 @@
#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-#define MAX_SHARED_LIBS (4)
-#else
-#define MAX_SHARED_LIBS (1)
-#endif
+#define MAX_SHARED_LIBS (1)
#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
#define DATA_START_OFFSET_WORDS (0)
@@ -93,38 +88,13 @@ struct lib_info {
} lib_list[MAX_SHARED_LIBS];
};
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-static int load_flat_shared_library(int id, struct lib_info *p);
-#endif
-
static int load_flat_binary(struct linux_binprm *);
-#ifdef CONFIG_COREDUMP
-static int flat_core_dump(struct coredump_params *cprm);
-#endif
static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
.load_binary = load_flat_binary,
-#ifdef CONFIG_COREDUMP
- .core_dump = flat_core_dump,
- .min_coredump = PAGE_SIZE
-#endif
};
-/****************************************************************************/
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- */
-
-#ifdef CONFIG_COREDUMP
-static int flat_core_dump(struct coredump_params *cprm)
-{
- pr_warn("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, cprm->siginfo->si_signo);
- return 1;
-}
-#endif
/****************************************************************************/
/*
@@ -329,51 +299,18 @@ out_free:
/****************************************************************************/
static unsigned long
-calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
+calc_reloc(unsigned long r, struct lib_info *p)
{
unsigned long addr;
- int id;
unsigned long start_brk;
unsigned long start_data;
unsigned long text_len;
unsigned long start_code;
-#ifdef CONFIG_BINFMT_SHARED_FLAT
- if (r == 0)
- id = curid; /* Relocs of 0 are always self referring */
- else {
- id = (r >> 24) & 0xff; /* Find ID for this reloc */
- r &= 0x00ffffff; /* Trim ID off here */
- }
- if (id >= MAX_SHARED_LIBS) {
- pr_err("reference 0x%lx to shared library %d", r, id);
- goto failed;
- }
- if (curid != id) {
- if (internalp) {
- pr_err("reloc address 0x%lx not in same module "
- "(%d != %d)", r, curid, id);
- goto failed;
- } else if (!p->lib_list[id].loaded &&
- load_flat_shared_library(id, p) < 0) {
- pr_err("failed to load library %d", id);
- goto failed;
- }
- /* Check versioning information (i.e. time stamps) */
- if (p->lib_list[id].build_date && p->lib_list[curid].build_date &&
- p->lib_list[curid].build_date < p->lib_list[id].build_date) {
- pr_err("library %d is younger than %d", id, curid);
- goto failed;
- }
- }
-#else
- id = 0;
-#endif
-
- start_brk = p->lib_list[id].start_brk;
- start_data = p->lib_list[id].start_data;
- start_code = p->lib_list[id].start_code;
- text_len = p->lib_list[id].text_len;
+ start_brk = p->lib_list[0].start_brk;
+ start_data = p->lib_list[0].start_data;
+ start_code = p->lib_list[0].start_code;
+ text_len = p->lib_list[0].text_len;
if (r > start_brk - start_data + text_len) {
pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)",
@@ -440,8 +377,32 @@ static void old_reloc(unsigned long rl)
/****************************************************************************/
+static inline u32 __user *skip_got_header(u32 __user *rp)
+{
+ if (IS_ENABLED(CONFIG_RISCV)) {
+ /*
+ * RISC-V has a 16 byte GOT PLT header for elf64-riscv
+ * and 8 byte GOT PLT header for elf32-riscv.
+ * Skip the whole GOT PLT header, since it is reserved
+ * for the dynamic linker (ld.so).
+ */
+ u32 rp_val0, rp_val1;
+
+ if (get_user(rp_val0, rp))
+ return rp;
+ if (get_user(rp_val1, rp + 1))
+ return rp;
+
+ if (rp_val0 == 0xffffffff && rp_val1 == 0xffffffff)
+ rp += 4;
+ else if (rp_val0 == 0xffffffff)
+ rp += 2;
+ }
+ return rp;
+}
+
static int load_flat_file(struct linux_binprm *bprm,
- struct lib_info *libinfo, int id, unsigned long *extra_stack)
+ struct lib_info *libinfo, unsigned long *extra_stack)
{
struct flat_hdr *hdr;
unsigned long textpos, datapos, realdatastart;
@@ -493,14 +454,6 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- /* Don't allow old format executables to use shared libraries */
- if (rev == OLD_FLAT_VERSION && id != 0) {
- pr_err("shared libraries are not available before rev 0x%lx\n",
- FLAT_VERSION);
- ret = -ENOEXEC;
- goto err;
- }
-
/*
* fix up the flags for the older format, there were all kinds
* of endian hacks, this only works for the simple cases
@@ -551,15 +504,13 @@ static int load_flat_file(struct linux_binprm *bprm,
}
/* Flush all traces of the currently running executable */
- if (id == 0) {
- ret = begin_new_exec(bprm);
- if (ret)
- goto err;
+ ret = begin_new_exec(bprm);
+ if (ret)
+ goto err;
- /* OK, This is the point of no return */
- set_personality(PER_LINUX_32BIT);
- setup_new_exec(bprm);
- }
+ /* OK, This is the point of no return */
+ set_personality(PER_LINUX_32BIT);
+ setup_new_exec(bprm);
/*
* calculate the extra space we need to map in
@@ -739,42 +690,40 @@ static int load_flat_file(struct linux_binprm *bprm,
text_len -= sizeof(struct flat_hdr); /* the real code len */
/* The main program needs a little extra setup in the task structure */
- if (id == 0) {
- current->mm->start_code = start_code;
- current->mm->end_code = end_code;
- current->mm->start_data = datapos;
- current->mm->end_data = datapos + data_len;
- /*
- * set up the brk stuff, uses any slack left in data/bss/stack
- * allocation. We put the brk after the bss (between the bss
- * and stack) like other platforms.
- * Userspace code relies on the stack pointer starting out at
- * an address right at the end of a page.
- */
- current->mm->start_brk = datapos + data_len + bss_len;
- current->mm->brk = (current->mm->start_brk + 3) & ~3;
+ current->mm->start_code = start_code;
+ current->mm->end_code = end_code;
+ current->mm->start_data = datapos;
+ current->mm->end_data = datapos + data_len;
+ /*
+ * set up the brk stuff, uses any slack left in data/bss/stack
+ * allocation. We put the brk after the bss (between the bss
+ * and stack) like other platforms.
+ * Userspace code relies on the stack pointer starting out at
+ * an address right at the end of a page.
+ */
+ current->mm->start_brk = datapos + data_len + bss_len;
+ current->mm->brk = (current->mm->start_brk + 3) & ~3;
#ifndef CONFIG_MMU
- current->mm->context.end_brk = memp + memp_size - stack_len;
+ current->mm->context.end_brk = memp + memp_size - stack_len;
#endif
- }
if (flags & FLAT_FLAG_KTRACE) {
pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n",
textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n",
- id ? "Lib" : "Load", bprm->filename,
+ "Load", bprm->filename,
start_code, end_code, datapos, datapos + data_len,
datapos + data_len, (datapos + data_len + bss_len + 3) & ~3);
}
/* Store the current module values into the global library structure */
- libinfo->lib_list[id].start_code = start_code;
- libinfo->lib_list[id].start_data = datapos;
- libinfo->lib_list[id].start_brk = datapos + data_len + bss_len;
- libinfo->lib_list[id].text_len = text_len;
- libinfo->lib_list[id].loaded = 1;
- libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
- libinfo->lib_list[id].build_date = ntohl(hdr->build_date);
+ libinfo->lib_list[0].start_code = start_code;
+ libinfo->lib_list[0].start_data = datapos;
+ libinfo->lib_list[0].start_brk = datapos + data_len + bss_len;
+ libinfo->lib_list[0].text_len = text_len;
+ libinfo->lib_list[0].loaded = 1;
+ libinfo->lib_list[0].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
+ libinfo->lib_list[0].build_date = ntohl(hdr->build_date);
/*
* We just load the allocations into some temporary memory to
@@ -789,14 +738,15 @@ static int load_flat_file(struct linux_binprm *bprm,
* image.
*/
if (flags & FLAT_FLAG_GOTPIC) {
- for (rp = (u32 __user *)datapos; ; rp++) {
+ rp = skip_got_header((u32 __user *) datapos);
+ for (; ; rp++) {
u32 addr, rp_val;
if (get_user(rp_val, rp))
return -EFAULT;
if (rp_val == 0xffffffff)
break;
if (rp_val) {
- addr = calc_reloc(rp_val, libinfo, id, 0);
+ addr = calc_reloc(rp_val, libinfo);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -832,7 +782,7 @@ static int load_flat_file(struct linux_binprm *bprm,
return -EFAULT;
relval = ntohl(tmp);
addr = flat_get_relocate_addr(relval);
- rp = (u32 __user *)calc_reloc(addr, libinfo, id, 1);
+ rp = (u32 __user *)calc_reloc(addr, libinfo);
if (rp == (u32 __user *)RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -855,7 +805,7 @@ static int load_flat_file(struct linux_binprm *bprm,
*/
addr = ntohl((__force __be32)addr);
}
- addr = calc_reloc(addr, libinfo, id, 0);
+ addr = calc_reloc(addr, libinfo);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -883,7 +833,7 @@ static int load_flat_file(struct linux_binprm *bprm,
/* zero the BSS, BRK and stack areas */
if (clear_user((void __user *)(datapos + data_len), bss_len +
(memp + memp_size - stack_len - /* end brk */
- libinfo->lib_list[id].start_brk) + /* start brk */
+ libinfo->lib_list[0].start_brk) + /* start brk */
stack_len))
return -EFAULT;
@@ -894,49 +844,6 @@ err:
/****************************************************************************/
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-
-/*
- * Load a shared library into memory. The library gets its own data
- * segment (including bss) but not argv/argc/environ.
- */
-
-static int load_flat_shared_library(int id, struct lib_info *libs)
-{
- /*
- * This is a fake bprm struct; only the members "buf", "file" and
- * "filename" are actually used.
- */
- struct linux_binprm bprm;
- int res;
- char buf[16];
- loff_t pos = 0;
-
- memset(&bprm, 0, sizeof(bprm));
-
- /* Create the file name */
- sprintf(buf, "/lib/lib%d.so", id);
-
- /* Open the file up */
- bprm.filename = buf;
- bprm.file = open_exec(bprm.filename);
- res = PTR_ERR(bprm.file);
- if (IS_ERR(bprm.file))
- return res;
-
- res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos);
-
- if (res >= 0)
- res = load_flat_file(&bprm, libs, id, NULL);
-
- allow_write_access(bprm.file);
- fput(bprm.file);
-
- return res;
-}
-
-#endif /* CONFIG_BINFMT_SHARED_FLAT */
-/****************************************************************************/
/*
* These are the functions used to load flat style executables and shared
@@ -968,7 +875,7 @@ static int load_flat_binary(struct linux_binprm *bprm)
stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN);
- res = load_flat_file(bprm, &libinfo, 0, &stack_len);
+ res = load_flat_file(bprm, &libinfo, &stack_len);
if (res < 0)
return res;
@@ -1013,20 +920,6 @@ static int load_flat_binary(struct linux_binprm *bprm)
*/
start_addr = libinfo.lib_list[0].entry;
-#ifdef CONFIG_BINFMT_SHARED_FLAT
- for (i = MAX_SHARED_LIBS-1; i > 0; i--) {
- if (libinfo.lib_list[i].loaded) {
- /* Push previos first to call address */
- unsigned long __user *sp;
- current->mm->start_stack -= sizeof(unsigned long);
- sp = (unsigned long __user *)current->mm->start_stack;
- if (put_user(start_addr, sp))
- return -EFAULT;
- start_addr = libinfo.lib_list[i].entry;
- }
- }
-#endif
-
#ifdef FLAT_PLAT_INIT
FLAT_PLAT_INIT(regs);
#endif
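
Beyond dropping shared-library support, the rework adds skip_got_header(), which encodes the RISC-V GOT PLT reservation: two leading 0xffffffff words mark the 16-byte elf64-riscv header, a single one marks the 8-byte elf32-riscv header, and both belong to the dynamic linker rather than to relocatable entries. A userspace rendering of the same detection on an in-memory GOT (illustrative only; the loader reads these words via get_user()):

/* Userspace rendering of the skip_got_header() check above, run on an
 * in-memory GOT rather than user pointers; illustrative only. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static const uint32_t *skip_got_header(const uint32_t *rp)
{
	if (rp[0] == 0xffffffff && rp[1] == 0xffffffff)
		return rp + 4;	/* elf64-riscv: 16-byte GOT PLT header */
	if (rp[0] == 0xffffffff)
		return rp + 2;	/* elf32-riscv: 8-byte GOT PLT header */
	return rp;
}

int main(void)
{
	/* Fake elf64-style GOT: header sentinels, two entries, terminator. */
	uint32_t got[] = { 0xffffffff, 0xffffffff, 0, 0,
			   0x1000, 0x2040, 0xffffffff };
	const uint32_t *rp = skip_got_header(got);

	/* Walk entries until the 0xffffffff terminator, as the
	 * FLAT_FLAG_GOTPIC loop in load_flat_file() does. */
	for (; *rp != 0xffffffff; rp++)
		printf("would relocate GOT entry 0x%" PRIx32 "\n", *rp);
	return 0;
}
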
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0a0d0eccee4e..548d6a5477b4 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type)
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
- ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
+ ret = __btrfs_set_acl(NULL, inode, acl, type);
if (ret)
inode->i_mode = old_mode;
return ret;
}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- int ret = 0;
-
- /* this happens with subvols */
- if (!dir)
- return 0;
-
- ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (ret)
- return ret;
-
- if (default_acl) {
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
-
- if (acl) {
- if (!ret)
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
- ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- }
-
- if (!default_acl && !acl)
- cache_no_acl(inode);
- return ret;
-}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 43c89952b7d2..aac240430efe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -15,13 +15,12 @@
enum {
WORK_DONE_BIT,
WORK_ORDER_DONE_BIT,
- WORK_HIGH_PRIO_BIT,
};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
-struct __btrfs_workqueue {
+struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
/* File system this workqueue services */
@@ -48,12 +47,7 @@ struct __btrfs_workqueue {
spinlock_t thres_lock;
};
-struct btrfs_workqueue {
- struct __btrfs_workqueue *normal;
- struct __btrfs_workqueue *high;
-};
-
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
return wq->fs_info;
}
@@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
/*
- * We could compare wq->normal->pending with num_online_cpus()
+ * We could compare wq->pending with num_online_cpus()
* to support "thresh == NO_THRESHOLD" case, but it requires
* moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
* postpone it until someone needs the support of that case.
*/
- if (wq->normal->thresh == NO_THRESHOLD)
+ if (wq->thresh == NO_THRESHOLD)
return false;
- return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+ return atomic_read(&wq->pending) > wq->thresh * 2;
}
-static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
- unsigned int flags, int limit_active, int thresh)
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name, unsigned int flags,
+ int limit_active, int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
ret->thresh = thresh;
}
- if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
- ret->current_active, name);
- else
- ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
- ret->current_active, name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
+ name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
@@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
INIT_LIST_HEAD(&ret->ordered_list);
spin_lock_init(&ret->list_lock);
spin_lock_init(&ret->thres_lock);
- trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
- return ret;
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-
-struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
- const char *name,
- unsigned int flags,
- int limit_active,
- int thresh)
-{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
-
- if (!ret)
- return NULL;
-
- ret->normal = __btrfs_alloc_workqueue(fs_info, name,
- flags & ~WQ_HIGHPRI,
- limit_active, thresh);
- if (!ret->normal) {
- kfree(ret);
- return NULL;
- }
-
- if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
- limit_active, thresh);
- if (!ret->high) {
- __btrfs_destroy_workqueue(ret->normal);
- kfree(ret);
- return NULL;
- }
- }
+ trace_btrfs_workqueue_alloc(ret, name);
return ret;
}
@@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
* This hook WILL be called in IRQ handler context,
* so workqueue_set_max_active MUST NOT be called in this hook
*/
-static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
if (wq->thresh == NO_THRESHOLD)
return;
@@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
* This hook is called in kthread content.
* So workqueue_set_max_active is called here.
*/
-static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
@@ -217,7 +173,7 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq,
+static void run_ordered_work(struct btrfs_workqueue *wq,
struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
@@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
{
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq = work->wq;
int need_order = 0;
/*
@@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work)
*/
if (work->ordered_func)
need_order = 1;
- wq = work->wq;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
@@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
work->flags = 0;
}
-static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
- struct btrfs_work *work)
+void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
unsigned long flags;
@@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
queue_work(wq->normal_wq, &work->normal_work);
}
-void btrfs_queue_work(struct btrfs_workqueue *wq,
- struct btrfs_work *work)
-{
- struct __btrfs_workqueue *dest_wq;
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
- dest_wq = wq->high;
- else
- dest_wq = wq->normal;
- __btrfs_queue_work(dest_wq, work);
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
-{
- destroy_workqueue(wq->normal_wq);
- trace_btrfs_workqueue_destroy(wq);
- kfree(wq);
-}
-
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
- if (wq->high)
- __btrfs_destroy_workqueue(wq->high);
- __btrfs_destroy_workqueue(wq->normal);
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
kfree(wq);
}
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
- if (!wq)
- return;
- wq->normal->limit_active = limit_active;
- if (wq->high)
- wq->high->limit_active = limit_active;
-}
-
-void btrfs_set_work_high_priority(struct btrfs_work *work)
-{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (wq)
+ wq->limit_active = limit_active;
}
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
- if (wq->high)
- flush_workqueue(wq->high->normal_wq);
-
- flush_workqueue(wq->normal->normal_wq);
+ flush_workqueue(wq->normal_wq);
}
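
With the internal high-priority sub-queue removed, high-priority work goes through a dedicated queue instead; later in this series fs_info->hipri_workers is added for that purpose (see the ctree.h hunk below). A minimal caller sketch, assuming both queues were created with btrfs_alloc_workqueue() (the helper name is hypothetical):

static void example_queue_work(struct btrfs_fs_info *fs_info,
			       struct btrfs_work *work, bool high_prio)
{
	/* Pick the queue up front instead of flagging the work item. */
	if (high_prio)
		btrfs_queue_work(fs_info->hipri_workers, work);
	else
		btrfs_queue_work(fs_info->workers, work);
}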
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 3204daa51b95..07960529b360 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -11,8 +11,6 @@
struct btrfs_fs_info;
struct btrfs_workqueue;
-/* Internal use only */
-struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
typedef void (*btrfs_work_func_t)(struct work_struct *arg);
@@ -25,7 +23,7 @@ struct btrfs_work {
/* Don't touch things below */
struct work_struct normal_work;
struct list_head ordered_list;
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq;
unsigned long flags;
};
@@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
-void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0dd6de994199..ede389f2602d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct rb_node **p;
struct rb_node *parent = NULL;
struct btrfs_block_group *cache;
+ bool leftmost = true;
ASSERT(block_group->length != 0);
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
+ write_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_root.rb_node;
while (*p) {
parent = *p;
@@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
p = &(*p)->rb_left;
} else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return -EEXIST;
}
}
rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_insert_color_cached(&block_group->cache_node,
+ &info->block_group_cache_tree, leftmost);
- if (info->first_logical_byte > block_group->start)
- info->first_logical_byte = block_group->start;
-
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return 0;
}
@@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
struct rb_node *n;
u64 end, start;
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
break;
}
}
- if (ret) {
+ if (ret)
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->start)
- info->first_logical_byte = ret->start;
- }
- spin_unlock(&info->block_group_cache_lock);
+ read_unlock(&info->block_group_cache_lock);
return ret;
}
@@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
- spin_lock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
- return cache;
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
btrfs_get_block_group(cache);
} else
cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
return cache;
}
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent, and increment the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and is currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write; the
+ * caller is responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we cannot do a NOCOW write.
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
{
struct btrfs_block_group *bg;
- bool ret = true;
+ bool can_nocow = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
- return false;
+ return NULL;
spin_lock(&bg->lock);
if (bg->ro)
- ret = false;
+ can_nocow = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
- /* No put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
+ if (!can_nocow) {
btrfs_put_block_group(bg);
+ return NULL;
+ }
- return ret;
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
}
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * @bg: The block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. If it
+ * wants to use it, it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
- struct btrfs_block_group *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
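
A caller-side sketch of the reworked NOCOW interface (the function and its error code are hypothetical; the ordered-extent step is elided): the pointer returned by btrfs_inc_nocow_writers() is handed straight back to btrfs_dec_nocow_writers(), avoiding the extra block group lookup the old code performed:

static int example_nocow_write(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *bg;

	bg = btrfs_inc_nocow_writers(fs_info, bytenr);
	if (!bg)
		return -ENOSPC;	/* cannot NOCOW, fall back to a COW write */

	/* ... create the ordered extent for the NOCOW write here ... */

	/* Drops the writer count and the reference from the inc call. */
	btrfs_dec_nocow_writers(bg);
	return 0;
}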
@@ -772,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
@@ -957,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
- if (fs_info->first_logical_byte == block_group->start)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@@ -992,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
@@ -1006,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
@@ -1367,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto next;
+ }
+
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
@@ -1512,6 +1538,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1522,6 +1555,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
@@ -1692,35 +1728,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
struct btrfs_key found_key;
- struct extent_buffer *leaf;
- int slot;
-
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = read_bg_from_eb(fs_info, &found_key, path);
- break;
+ return read_bg_from_eb(fs_info, &found_key, path);
}
-
- path->slots[0]++;
}
-out:
return ret;
}
@@ -3220,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
+static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before, don't reclaim: we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
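
As a worked example of the check above (numbers illustrative): for a 1 GiB block group with bg_reclaim_threshold = 75, thresh = div_factor_fine(1 GiB, 75), roughly 768 MiB. A free that takes the group from 800 MiB used (old_val >= thresh) down to 700 MiB (new_val < thresh) crosses the threshold, so the group is marked for reclaim; a group that was already below 768 MiB before the free is assumed to be newly allocated and is left alone.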
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3242,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
+ bool reclaim;
+
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
@@ -3287,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -3313,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
btrfs_put_block_group(cache);
@@ -3957,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
@@ -3994,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->zone_active_bgs_lock);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
@@ -4024,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);
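
The resulting locking rule, as a sketch (hypothetical helper): lookups take the new rwlock shared and can run concurrently, while insertions and removals take it exclusively:

static void example_remove_bg(struct btrfs_fs_info *info,
			      struct btrfs_block_group *bg)
{
	/* Tree modifications are writers... */
	write_lock(&info->block_group_cache_lock);
	rb_erase_cached(&bg->cache_node, &info->block_group_cache_tree);
	RB_CLEAR_NODE(&bg->cache_node);
	write_unlock(&info->block_group_cache_lock);

	/* ...while read-only searches use read_lock()/read_unlock(). */
}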
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index e8308f2ad07d..3ac668ace50a 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -212,6 +212,8 @@ struct btrfs_block_group {
u64 meta_write_pointer;
struct map_lookup *physical_map;
struct list_head active_bg_list;
+ struct work_struct zone_finish_work;
+ struct extent_buffer *last_eb;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -254,8 +256,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 47e72d72f7d0..33811e896623 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -384,30 +384,16 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
return ret;
}
-struct btrfs_dio_private {
- struct inode *inode;
-
- /*
- * Since DIO can use anonymous page, we cannot use page_offset() to
- * grab the file offset, thus need a dedicated member for file offset.
- */
- u64 file_offset;
- u64 disk_bytenr;
- /* Used for bio::bi_size */
- u32 bytes;
-
- /*
- * References to this structure. There is one reference per in-flight
- * bio plus one while we're still setting up.
- */
- refcount_t refs;
-
- /* dio_bio came from fs/direct-io.c */
- struct bio *dio_bio;
-
- /* Array of checksums */
- u8 csums[];
-};
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
/*
* btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index abac86a75840..5d20137b7b67 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1552,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- for (i = 0; i < num_pages; i++) {
- block_ctx->pagev[i] = alloc_page(GFP_NOFS);
- if (!block_ctx->pagev[i])
- return -1;
- }
+ ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
+ if (ret)
+ return ret;
dev_bytenr = block_ctx->dev_bytenr;
for (i = 0; i < num_pages;) {
struct bio *bio;
unsigned int j;
- bio = btrfs_bio_alloc(num_pages - i);
- bio_set_dev(bio, block_ctx->dev->bdev);
+ bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
+ REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
- bio->bi_opf = REQ_OP_READ;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -2033,7 +2030,7 @@ continue_loop:
static void btrfsic_bio_end_io(struct bio *bp)
{
- struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ struct btrfsic_block *block = bp->bi_private;
int iodone_w_error;
/* mutex is not held! This is not safe if IO is not yet completed
@@ -2635,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
-static void __btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- struct btrfsic_dev_state *dev_state;
+ unsigned int segs = bio_segments(bio);
+ u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ u64 cur_bytenr = dev_bytenr;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ char **mapped_datav;
+ int bio_is_patched = 0;
+ int i = 0;
+
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info(
+"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, segs,
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
- if (!btrfsic_is_initialized)
+ mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
return;
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bio() is also called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- if (NULL != dev_state &&
- (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- int i = 0;
- u64 dev_bytenr;
- u64 cur_bytenr;
- struct bio_vec bvec;
- struct bvec_iter iter;
- int bio_is_patched;
- char **mapped_datav;
- unsigned int segs = bio_segments(bio);
-
- dev_bytenr = 512 * bio->bi_iter.bi_sector;
- bio_is_patched = 0;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs,
- sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- goto leave;
- cur_bytenr = dev_bytenr;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, segs,
- bio, &bio_is_patched,
- bio->bi_opf);
- kfree(mapped_datav);
- } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = page_address(bvec.bv_page);
+ i++;
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- }
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
}
-leave:
- mutex_unlock(&btrfsic_mutex);
+
+ btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
+ bio, &bio_is_patched, bio->bi_opf);
+ kfree(mapped_datav);
}
-void btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- __btrfsic_submit_bio(bio);
- submit_bio(bio);
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
+
+ if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = bio->bi_opf;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ } else if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE))) {
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
+ }
}
-int btrfsic_submit_bio_wait(struct bio *bio)
+void btrfsic_check_bio(struct bio *bio)
{
- __btrfsic_submit_bio(bio);
- return submit_bio_wait(bio);
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ /*
+ * We can be called before btrfsic_mount, so there might not be a
+ * dev_state.
+ */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
+ mutex_lock(&btrfsic_mutex);
+ if (dev_state) {
+ if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
+ btrfsic_check_write_bio(bio, dev_state);
+ else if (bio->bi_opf & REQ_PREFLUSH)
+ btrfsic_check_flush_bio(bio, dev_state);
+ }
+ mutex_unlock(&btrfsic_mutex);
}
int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index bcc730a06cb5..e4c8aed7996f 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_submit_bio(struct bio *bio);
-int btrfsic_submit_bio_wait(struct bio *bio);
+void btrfsic_check_bio(struct bio *bio);
#else
-#define btrfsic_submit_bio submit_bio
-#define btrfsic_submit_bio_wait submit_bio_wait
+static inline void btrfsic_check_bio(struct bio *bio) { }
#endif
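
With the wrappers gone, the integrity check is decoupled from I/O submission and a call site pairs it with a plain submit_bio() itself; a sketch (hypothetical caller):

static void example_submit(struct bio *bio)
{
	/* Compiles to a no-op stub without CONFIG_BTRFS_FS_CHECK_INTEGRITY. */
	btrfsic_check_bio(bio);
	submit_bio(bio);
}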
int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19bf36d8ffea..f4564f32f6d9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -425,7 +425,6 @@ out:
}
static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct compressed_bio *cb,
struct bio *bio, int mirror_num)
{
blk_status_t ret;
@@ -604,7 +603,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
goto finish_cb;
}
- ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ ret = submit_compressed_bio(fs_info, bio, 0);
if (ret)
goto finish_cb;
bio = NULL;
@@ -802,15 +801,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
- unsigned int nr_pages;
- unsigned int pg_index;
struct bio *comp_bio = NULL;
const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_byte = disk_bytenr;
@@ -820,7 +817,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_start;
struct extent_map *em;
blk_status_t ret;
- int faili = 0;
+ int ret2;
+ int i;
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -855,32 +853,26 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_len = em->len;
em_start = em->start;
- free_extent_map(em);
- em = NULL;
-
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_compress_type(bio_flags);
+ cb->compress_type = em->compress_type;
cb->orig_bio = bio;
- nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_NOFS);
+ free_extent_map(em);
+ em = NULL;
+
+ cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
if (!cb->compressed_pages) {
ret = BLK_STS_RESOURCE;
- goto fail1;
+ goto fail;
}
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
- if (!cb->compressed_pages[pg_index]) {
- faili = pg_index - 1;
- ret = BLK_STS_RESOURCE;
- goto fail2;
- }
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ if (ret2) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
}
- faili = nr_pages - 1;
- cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -949,28 +941,29 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
if (ret)
goto finish_cb;
comp_bio = NULL;
}
}
- return BLK_STS_OK;
+ return;
-fail2:
- while (faili >= 0) {
- __free_page(cb->compressed_pages[faili]);
- faili--;
+fail:
+ if (cb->compressed_pages) {
+ for (i = 0; i < cb->nr_pages; i++) {
+ if (cb->compressed_pages[i])
+ __free_page(cb->compressed_pages[i]);
+ }
}
kfree(cb->compressed_pages);
-fail1:
kfree(cb);
out:
free_extent_map(em);
bio->bi_status = ret;
bio_endio(bio);
- return ret;
+ return;
finish_cb:
if (comp_bio) {
comp_bio->bi_status = ret;
@@ -978,7 +971,7 @@ finish_cb:
}
/* All bytes of @cb are submitted, endio will free @cb */
if (cur_disk_byte == disk_bytenr + compressed_len)
- return ret;
+ return;
wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
(disk_bytenr + compressed_len - cur_disk_byte) >>
@@ -990,7 +983,6 @@ finish_cb:
ASSERT(refcount_read(&cb->pending_sectors));
/* Now we are the only one referring @cb, can finish it safely. */
finish_compressed_bio_read(cb);
- return ret;
}
/*
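
The fail path above implies a contract for btrfs_alloc_page_array() (inferred from this diff rather than from the helper's definition): on failure the array may be partially populated, so the caller frees exactly the non-NULL entries. A sketch, assuming a zero-initialized array:

static int example_alloc_pages(struct page **pages, unsigned int nr_pages)
{
	int ret = btrfs_alloc_page_array(nr_pages, pages);

	if (ret) {
		/* Partial success is possible: free only what was allocated. */
		for (unsigned int i = 0; i < nr_pages; i++) {
			if (pages[i])
				__free_page(pages[i]);
		}
	}
	return ret;
}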
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ac5b20731d2a..2707404389a5 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -102,8 +102,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css,
bool writeback);
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0eecf98d0abb..6e556031a8f3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,6 +16,7 @@
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
+#include "tree-checker.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(buf);
ret = btrfs_set_disk_extent_flags(trans, buf,
- new_flags, level, 0);
+ new_flags, level);
if (ret)
return ret;
}
@@ -1390,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * helper function for btrfs_search_slot. The goal is to find a block
- * in cache without setting the path to blocking. If we find the block
- * we return zero and the path is unchanged.
+ * Helper function for btrfs_search_slot() and other functions that do a search
+ * on a btree. The goal is to find a tree block in the cache (the radix tree at
+ * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
+ * its pages from disk.
*
- * If we can't find the block, we set the path blocking and do some
- * reada. -EAGAIN is returned and the search must be repeated.
+ * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
+ * whole btree search, starting again from the current root node.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
@@ -1409,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct btrfs_key first_key;
int ret;
int parent_level;
+ bool unlock_up;
+ unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ /*
+ * If we need to read an extent buffer from disk and we are holding locks
+ * on upper level nodes, we unlock all the upper nodes before reading the
+ * extent buffer, and then return -EAGAIN to the caller as it needs to
+ * restart the search. We don't release the lock on the current level
+ * because we need to walk this node to figure out which blocks to read.
+ */
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
@@ -1436,30 +1447,38 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
+ if (unlock_up)
+ btrfs_unlock_up_safe(p, level + 1);
+
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EIO;
}
- *eb_ret = tmp;
- return 0;
+ if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EUCLEAN;
+ }
+
+ if (unlock_up)
+ ret = -EAGAIN;
+
+ goto out;
}
- /*
- * reduce lock contention at high levels
- * of the btree by dropping locks before
- * we read. Don't release the lock on the current
- * level because we need to walk this node to figure
- * out which blocks to read.
- */
- btrfs_unlock_up_safe(p, level + 1);
+ if (unlock_up) {
+ btrfs_unlock_up_safe(p, level + 1);
+ ret = -EAGAIN;
+ } else {
+ ret = 0;
+ }
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
if (IS_ERR(tmp)) {
@@ -1474,9 +1493,15 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
*/
if (!extent_buffer_uptodate(tmp))
ret = -EIO;
- free_extent_buffer(tmp);
- btrfs_release_path(p);
+out:
+ if (ret == 0) {
+ *eb_ret = tmp;
+ } else {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ }
+
return ret;
}
@@ -2279,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
+/**
+ * Search for a valid slot for the given path.
+ *
+ * @root: The root node of the tree.
+ * @key: Will contain a valid item if found.
+ * @path: The starting point to validate the slot.
+ *
+ * Return: 0 if the item is valid
+ * 1 if not found
+ * <0 if error.
+ */
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ while (1) {
+ int ret;
+ const int slot = path->slots[0];
+ const struct extent_buffer *leaf = path->nodes[0];
+
+ /* This is where we start walking the path. */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset the path.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
+ continue;
+ }
+ /* Store the found, valid item in @key. */
+ btrfs_item_key_to_cpu(leaf, key, slot);
+ break;
+ }
+ return 0;
+}
+
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node points to 'key'.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 077c95e9baa5..0e49b1a0c071 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,13 +675,13 @@ struct btrfs_fs_info {
rwlock_t global_root_lock;
struct rb_root global_root_tree;
- spinlock_t fs_roots_radix_lock;
- struct radix_tree_root fs_roots_radix;
+ /* The xarray that holds all the FS roots */
+ spinlock_t fs_roots_lock;
+ struct xarray fs_roots;
/* block group cache stuff */
- spinlock_t block_group_cache_lock;
- u64 first_logical_byte;
- struct rb_root block_group_cache_tree;
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
/* keep track of unallocated space */
atomic64_t free_chunk_space;
@@ -848,12 +848,13 @@ struct btrfs_fs_info {
* two
*/
struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
- struct btrfs_workqueue *rmw_workers;
+ struct workqueue_struct *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
@@ -946,9 +947,9 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- struct btrfs_workqueue *scrub_workers;
- struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_parity_workers;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -994,10 +995,10 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer radix tree */
+ /* Extent buffer xarray */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
+ struct xarray extent_buffers;
/* next backup root to be overwritten */
int backup_root_index;
@@ -1045,10 +1046,7 @@ struct btrfs_fs_info {
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
* if the mode is enabled
*/
- union {
- u64 zone_size;
- u64 zoned;
- };
+ u64 zone_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
@@ -1121,7 +1119,8 @@ enum {
*/
BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
- BTRFS_ROOT_IN_RADIX,
+ /* The root is tracked in fs_info::fs_roots */
+ BTRFS_ROOT_REGISTERED,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
BTRFS_ROOT_DEFRAG_RUNNING,
BTRFS_ROOT_FORCE_COW,
@@ -1225,10 +1224,10 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by inode_lock
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -2784,7 +2783,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict);
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2811,8 +2811,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags,
- int level, int is_data);
+ struct extent_buffer *eb, u64 flags, int level);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
@@ -2892,7 +2891,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes);
+ u64 disk_num_bytes, bool noflush);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -3039,6 +3038,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path);
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+/*
+ * Search in @root for a given @key, and store the slot found in @found_key.
+ *
+ * @root: The root node of the tree.
+ * @key: The key we are looking for.
+ * @found_key: Will hold the found item.
+ * @path: Holds the current slot/leaf.
+ * @iter_ret: Contains the value returned from btrfs_search_slot or
+ * btrfs_get_next_valid_item, whichever was executed last.
+ *
+ * The @iter_ret is an output variable that will contain the return value of
+ * btrfs_search_slot, if it encountered an error, or the value returned from
+ * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
+ * slot was found, 1 if there were no more leaves, and <0 if there was an error.
+ *
+ * It's recommended to use a separate variable for iter_ret and then use it to
+ * set the function return value so there's no confusion over the 0/1/errno
+ * values stemming from btrfs_search_slot.
+ */
+#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \
+ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
+ (iter_ret) >= 0 && \
+ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
+ (path)->slots[0]++ \
+ )
+
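
A minimal usage sketch for the new macro (hypothetical caller; the dev-replace and dir-item conversions later in this diff follow the same shape):

static int example_count_dir_items(struct btrfs_root *root,
				   struct btrfs_path *path, u64 dirid)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	int iter_ret;
	int count = 0;

	key.objectid = dirid;
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = 0;

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		if (found_key.objectid != dirid ||
		    found_key.type != BTRFS_DIR_INDEX_KEY)
			break;
		count++;
	}
	/* 1 just means we ran out of leaves; only <0 is an error. */
	if (iter_ret < 0)
		return iter_ret;
	return count;
}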
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@@ -3190,7 +3218,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* file-item.c */
-struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
@@ -3224,8 +3251,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
@@ -3255,10 +3282,28 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- struct user_namespace *mnt_userns);
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /*
+ * Output from btrfs_new_inode_prepare(), input to
+ * btrfs_create_new_inode().
+ */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+};
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3269,7 +3314,6 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
-int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -3314,9 +3358,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
+
extern const struct dentry_operations btrfs_dentry_operations;
-extern const struct iomap_ops btrfs_dio_iomap_ops;
-extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
@@ -3328,6 +3372,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
const u64 add_bytes,
const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3403,11 +3448,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
-#ifdef CONFIG_PRINTK
+#ifdef CONFIG_PRINTK_INDEX
+
+#define btrfs_printk(fs_info, fmt, args...) \
+do { \
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
+ _btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
__printf(2, 3)
__cold
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#elif defined(CONFIG_PRINTK)
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
#else
+
#define btrfs_printk(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, fmt, ##args)
#endif
@@ -3658,12 +3721,25 @@ do { \
__LINE__, (errno)); \
} while (0)
+#ifdef CONFIG_PRINTK_INDEX
+
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
+do { \
+ printk_index_subsys_emit( \
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
+ KERN_CRIT, fmt); \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
} while (0)
+#else
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+#endif
+
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
&(fs_info)->fs_state)))
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
@@ -3816,15 +3892,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
#else
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
-static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return 0;
+ return -EOPNOTSUPP;
}
#endif
@@ -3929,7 +4006,7 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
- return fs_info->zoned != 0;
+ return fs_info->zone_size > 0;
}
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index bd8267c4687d..36ab0859a263 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
}
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes)
+ u64 disk_num_bytes, bool noflush)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* If we have a transaction open (can happen if we call truncate_block
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
- if (btrfs_is_free_space_inode(inode)) {
+ if (noflush || btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
} else {
if (current->journal_info)
@@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
*/
calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+ noflush);
if (ret)
return ret;
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
@@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(inode, len, len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 748bf6b0d860..66779ab3ed4a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -128,36 +128,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
int ret;
-again:
- node = btrfs_get_delayed_node(btrfs_inode);
- if (node)
- return node;
+ do {
+ node = btrfs_get_delayed_node(btrfs_inode);
+ if (node)
+ return node;
- node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
- if (!node)
- return ERR_PTR(-ENOMEM);
- btrfs_init_delayed_node(node, root, ino);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
- refcount_set(&node->refs, 2);
+ /* Cached in the inode and can be accessed */
+ refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
- }
-
- spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
- spin_unlock(&root->inode_lock);
- kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
- goto again;
- }
+ spin_lock(&root->inode_lock);
+ ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
+ if (ret) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ if (ret != -EBUSY)
+ return ERR_PTR(ret);
+ }
+ } while (ret);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
return node;
}
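
The retry loop above is a generic optimistic-insert pattern over an xarray; a standalone sketch with hypothetical names, where -EBUSY from xa_insert() means another task won the race and the lookup is simply redone (the refcount handling against concurrent teardown is omitted here):

struct example_node {
	refcount_t refs;
	/* ... payload ... */
};

static struct example_node *example_get_or_create(struct xarray *xa,
						  unsigned long index)
{
	struct example_node *node;
	int ret;

	do {
		node = xa_load(xa, index);
		if (node)
			return node;	/* someone else inserted it first */

		node = kzalloc(sizeof(*node), GFP_NOFS);
		if (!node)
			return ERR_PTR(-ENOMEM);
		refcount_set(&node->refs, 1);

		ret = xa_insert(xa, index, node, GFP_NOFS);
		if (ret) {
			kfree(node);
			if (ret != -EBUSY)
				return ERR_PTR(ret);	/* e.g. -ENOMEM */
		}
	} while (ret);	/* -EBUSY: lost the race, retry the lookup */

	return node;
}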
@@ -276,8 +270,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -1870,34 +1863,35 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
+ struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
+ int n = 0;
+
spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
+ if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
- break;
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&delayed_node->refs)) {
+ delayed_nodes[n] = delayed_node;
+ n++;
+ }
+ if (n >= ARRAY_SIZE(delayed_nodes))
+ break;
}
+ index++;
spin_unlock(&root->inode_lock);
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < n; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 4176df149d04..99f37fca2e96 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
- false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 91a3aabad150..d6304b690ec4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op {
u8 level;
bool update_key;
bool update_flags;
- bool is_data;
u64 flags_to_set;
};
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f26202621989..a7dd6ba25e99 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -474,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
+ int iter_ret = 0;
int ret = 0;
u64 chunk_offset;
@@ -524,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- if (path->slots[0] >=
- btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- ret = 0;
- goto free_path;
- }
- } else {
- ret = 0;
- }
- }
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != src_dev->devid)
break;
@@ -557,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (found_key.offset < key.offset)
break;
- dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+ dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
- goto skip;
+ continue;
spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
-
-skip:
- ret = btrfs_next_item(root, path);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- break;
- }
}
+ if (iter_ret < 0)
+ ret = iter_ret;
-free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);
@@ -881,6 +854,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
@@ -930,12 +904,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
@@ -972,7 +946,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
@@ -1001,8 +975,8 @@ error:
btrfs_assign_next_active_device(src_device, tgt_device);
- list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
- fs_info->fs_devices->rw_devices++;
+ list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
+ fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -1025,7 +999,7 @@ error:
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/* replace the sysfs entry */
btrfs_sysfs_remove_device(src_device);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 3b532bab0755..72fb2c518a2b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
- u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ERR_PTR(ret);
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-
- while (1) {
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_for_each_slot(root, &key, &key, path, ret) {
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
@@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
name, name_len);
if (di)
return di;
-
- path->slots[0]++;
}
- return NULL;
+ /* Adjust return code if the key was not found in the next leaf. */
+ if (ret > 0)
+ ret = 0;
+
+ return ERR_PTR(ret);
}
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
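
Both conversions above rely on the btrfs_for_each_slot iterator, which wraps
the open-coded btrfs_search_slot()/btrfs_next_leaf() pattern they replace. A
minimal sketch of the usage contract (hypothetical caller, not part of this
patch): iter_ret stays 0 while items are found, goes positive when the tree
is exhausted, and negative on a search error.

	struct btrfs_key key = { .objectid = dirid, .type = BTRFS_DIR_INDEX_KEY };
	struct btrfs_key found_key;
	struct btrfs_path *path;
	int iter_ret;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		/* The iterator positions path->slots[0] at the next item. */
		if (found_key.objectid != dirid)
			break;
		/* ... process the item in path->nodes[0] at path->slots[0] ... */
	}
	/* iter_ret > 0 just means we ran off the end of the tree. */
	if (iter_ret < 0)
		ret = iter_ret;
	btrfs_free_path(path);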
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ed8e288cc369..89e94ea2fef5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,7 +5,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
-#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
@@ -374,9 +373,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
-static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+int btrfs_read_extent_buffer(struct extent_buffer *eb,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
@@ -486,7 +485,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
- /* A dirty eb shouldn't disappear from buffer_radix */
+ /* A dirty eb shouldn't disappear from extent_buffers */
if (WARN_ON(!eb))
return -EUCLEAN;
@@ -519,7 +518,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
u64 found_start;
struct extent_buffer *eb;
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (fs_info->nodesize < PAGE_SIZE)
return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
@@ -704,7 +703,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror);
eb = (struct extent_buffer *)page->private;
@@ -850,8 +849,7 @@ static void run_one_async_free(struct btrfs_work *work)
}
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 dio_file_offset,
+ int mirror_num, u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -874,9 +872,9 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
async->status = 0;
if (op_is_sync(bio->bi_opf))
- btrfs_set_work_high_priority(&async->work);
-
- btrfs_queue_work(fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->hipri_workers, &async->work);
+ else
+ btrfs_queue_work(fs_info->workers, &async->work);
return 0;
}
@@ -920,8 +918,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
-blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
@@ -933,31 +930,25 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
*/
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_METADATA);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (!ret)
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (!ret)
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- 0, btree_submit_bio_start);
+ btree_submit_bio_start);
}
- if (ret)
- goto out_w_error;
- return 0;
-
-out_w_error:
- bio->bi_status = ret;
- bio_endio(bio);
- return ret;
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
}
#ifdef CONFIG_MIGRATION
@@ -1005,12 +996,12 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- if (PageWriteback(page) || PageDirty(page))
- return 0;
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
- return try_release_extent_buffer(page);
+ return try_release_extent_buffer(&folio->page);
}
static void btree_invalidate_folio(struct folio *folio, size_t offset,
@@ -1019,7 +1010,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset,
struct extent_io_tree *tree;
tree = &BTRFS_I(folio->mapping->host)->io_tree;
extent_invalidate_folio(tree, folio, offset);
- btree_releasepage(&folio->page, GFP_NOFS);
+ btree_release_folio(folio, GFP_NOFS);
if (folio_get_private(folio)) {
btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
"folio private not zero on folio %llu",
@@ -1080,7 +1071,7 @@ static bool btree_dirty_folio(struct address_space *mapping,
static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
- .releasepage = btree_releasepage,
+ .release_folio = btree_release_folio,
.invalidate_folio = btree_invalidate_folio,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
@@ -1118,12 +1109,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
+ ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
+ if (btrfs_check_eb_owner(buf, owner_root)) {
+ free_extent_buffer_stale(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
return buf;
}
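
The new btrfs_check_eb_owner() call rejects tree blocks whose recorded owner
does not match the root they were read for (the helper itself lands in
tree-checker.c per the diffstat). A rough sketch of the core comparison,
assuming the simple case; the real check also has to allow for shareable
trees whose blocks may legitimately carry another root's owner:

	static int check_eb_owner_sketch(const struct extent_buffer *eb, u64 root_owner)
	{
		u64 found_owner = btrfs_header_owner(eb);

		if (found_owner != root_owner) {
			btrfs_crit(eb->fs_info,
		"corrupted header, block %llu has owner %llu expect %llu",
				   eb->start, found_owner, root_owner);
			return -EUCLEAN;
		}
		return 0;
	}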
@@ -1164,7 +1158,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -1216,9 +1210,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
- spin_lock(&fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
#endif
}
@@ -1563,6 +1557,23 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = -EIO;
goto fail;
}
+
+ /*
+ * For a real fs (not a dummy test fs), and excluding log/reloc
+ * trees, the root's owner must match the owner recorded in its
+ * root node.
+ */
+ if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ root->root_key.objectid != btrfs_header_owner(root->node)) {
+ btrfs_crit(fs_info,
+"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+ root->root_key.objectid, root->node->start,
+ btrfs_header_owner(root->node),
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ goto fail;
+ }
root->commit_root = btrfs_root_node(root);
return root;
fail:
@@ -1648,12 +1659,11 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root;
- spin_lock(&fs_info->fs_roots_radix_lock);
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)root_id);
+ spin_lock(&fs_info->fs_roots_lock);
+ root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
return root;
}
@@ -1695,20 +1705,14 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = radix_tree_preload(GFP_NOFS);
- if (ret)
- return ret;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- root);
+ spin_lock(&fs_info->fs_roots_lock);
+ ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
+ root, GFP_NOFS);
if (ret == 0) {
btrfs_grab_root(root);
- set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ set_bit(BTRFS_ROOT_REGISTERED, &root->state);
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
- radix_tree_preload_end();
+ spin_unlock(&fs_info->fs_roots_lock);
return ret;
}
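
Unlike the radix tree it replaces, the xarray needs no separate preload step:
xa_insert() takes the gfp mask directly and returns -EBUSY when the index is
already occupied. A standalone sketch of the pattern (hypothetical names):

	struct xarray roots;
	struct my_root *root;
	int err;

	xa_init(&roots);

	/* Fails with -EBUSY if something already sits at that index. */
	err = xa_insert(&roots, (unsigned long)objectid, new_root, GFP_NOFS);
	if (err)
		return err;

	/* Lookup; the xarray allows this under RCU or an external lock. */
	root = xa_load(&roots, (unsigned long)objectid);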
@@ -1964,7 +1968,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
+ struct btrfs_fs_info *fs_info = arg;
int again;
while (1) {
@@ -2266,10 +2270,12 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- btrfs_destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->rmw_workers)
+ destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2336,9 +2342,9 @@ void btrfs_put_root(struct btrfs_root *root)
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
- spin_lock(&root->fs_info->fs_roots_radix_lock);
+ spin_lock(&root->fs_info->fs_roots_lock);
list_del_init(&root->leak_list);
- spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ spin_unlock(&root->fs_info->fs_roots_lock);
#endif
kfree(root);
}
@@ -2346,28 +2352,21 @@ void btrfs_put_root(struct btrfs_root *root)
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
- int ret;
- struct btrfs_root *gang[8];
- int i;
+ struct btrfs_root *root;
+ unsigned long index = 0;
while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&gang[0]->root_list);
+ root = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&root->root_list);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
- btrfs_drop_and_free_fs_root(fs_info, gang[0]);
- btrfs_put_root(gang[0]);
+ if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ btrfs_drop_and_free_fs_root(fs_info, root);
+ btrfs_put_root(root);
}
- while (1) {
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, 0,
- ARRAY_SIZE(gang));
- if (!ret)
- break;
- for (i = 0; i < ret; i++)
- btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+ xa_for_each(&fs_info->fs_roots, index, root) {
+ btrfs_drop_and_free_fs_root(fs_info, root);
}
}
@@ -2444,7 +2443,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
- btrfs_alloc_workqueue(fs_info, "worker",
+ btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+ fs_info->hipri_workers =
+ btrfs_alloc_workqueue(fs_info, "worker-high",
flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
@@ -2476,8 +2477,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
max_active, 4);
- fs_info->rmw_workers =
- btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
+ fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
@@ -2492,8 +2492,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
- if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->flush_workers &&
+ if (!(fs_info->workers && fs_info->hipri_workers &&
+ fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
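
The dedicated high-priority queue replaces the old per-work
btrfs_set_work_high_priority() call: sync bios now simply go to a workqueue
created with WQ_HIGHPRI. In plain workqueue terms the scheme looks roughly
like this (btrfs wraps it in btrfs_alloc_workqueue()/btrfs_queue_work()):

	struct workqueue_struct *workers, *hipri_workers;
	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;

	workers = alloc_workqueue("worker", flags, 0);
	hipri_workers = alloc_workqueue("worker-high", flags | WQ_HIGHPRI, 0);

	/* At submit time, route synchronous bios to the high-priority queue: */
	if (op_is_sync(bio->bi_opf))
		queue_work(hipri_workers, &async->work);
	else
		queue_work(workers, &async->work);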
@@ -2815,12 +2815,14 @@ static int validate_super(struct btrfs_fs_info *fs_info,
}
/*
- * For 4K page size, we only support 4K sector size.
- * For 64K page size, we support 64K and 4K sector sizes.
+ * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ *
+ * We could support a 16K sectorsize with a 64K page size without
+ * problems, but such a sectorsize/page size combination doesn't make
+ * much sense. 4K will be our future standard; PAGE_SIZE has been
+ * supported from the very beginning.
*/
- if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
- (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
- sectorsize != SZ_64K))) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
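
The rewritten condition collapses the per-page-size cases into a single rule:
the sectorsize must be either 4K or exactly PAGE_SIZE, and can never exceed
PAGE_SIZE. Expressed as an equivalent boolean helper (a sketch, not code from
this patch):

	static bool sectorsize_supported(u64 sectorsize)
	{
		if (sectorsize > PAGE_SIZE)
			return false;
		/* 4K on larger pages is the (experimental) subpage case. */
		return sectorsize == SZ_4K || sectorsize == PAGE_SIZE;
	}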
@@ -3132,8 +3134,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
+ xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -3141,7 +3143,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->fs_roots_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
@@ -3209,9 +3211,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
- spin_lock_init(&fs_info->block_group_cache_lock);
- fs_info->block_group_cache_tree = RB_ROOT;
- fs_info->first_logical_byte = (u64)-1;
+ rwlock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
@@ -3295,7 +3296,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
static int btrfs_uuid_rescan_kthread(void *data)
{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+ struct btrfs_fs_info *fs_info = data;
int ret;
/*
@@ -3373,7 +3374,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
- * them into the fs_info->fs_roots_radix tree. This must be done before
+ * them into the fs_info->fs_roots xarray. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
@@ -3611,7 +3612,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
+ "cannot mount because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3649,7 +3650,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
+ "cannot mount read-write because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3658,17 +3659,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
+ /*
+ * The v1 space cache has some hardcoded PAGE_SIZE usage and is
+ * going to be deprecated.
+ *
+ * Force the v2 cache (free space tree) for the subpage case.
+ */
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
+ "forcing free space tree for sector size %u with page size %lu",
+ sectorsize, PAGE_SIZE);
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- if (btrfs_super_incompat_flags(fs_info->super_copy) &
- BTRFS_FEATURE_INCOMPAT_RAID56) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
- err = -EINVAL;
- goto fail_alloc;
- }
subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
if (!subpage_info)
goto fail_alloc;
@@ -4146,7 +4150,8 @@ static int write_dev_supers(struct btrfs_device *device,
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
errors++;
@@ -4227,6 +4232,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
*/
static void btrfs_end_empty_barrier(struct bio *bio)
{
+ bio_uninit(bio);
complete(bio->bi_private);
}
@@ -4236,7 +4242,7 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/*
@@ -4249,17 +4255,18 @@ static void write_dev_flush(struct btrfs_device *device)
* of simplicity, since this is a debug tool and not meant for use in
* non-debug builds.
*/
- struct request_queue *q = bdev_get_queue(device->bdev);
- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+ if (!bdev_write_cache(device->bdev))
return;
#endif
- bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
+ bio_init(bio, device->bdev, NULL, 0,
+ REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
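
With flush_bio now embedded in struct btrfs_device instead of separately
allocated, every flush re-initializes it with bio_init() and the endio pairs
that with bio_uninit() (see btrfs_end_empty_barrier() above) rather than
bio_put(). The lifecycle, sketched with a hypothetical device struct:

	struct my_device {
		struct block_device *bdev;
		struct bio flush_bio;		/* embedded, never bio_alloc()ed */
		struct completion flush_wait;
	};

	static void my_end_empty_barrier(struct bio *bio)
	{
		bio_uninit(bio);		/* release bio_init() state, no bio_put() */
		complete(bio->bi_private);
	}

	static void my_send_flush(struct my_device *dev)
	{
		struct bio *bio = &dev->flush_bio;

		bio_init(bio, dev->bdev, NULL, 0,
			 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
		bio->bi_end_io = my_end_empty_barrier;
		init_completion(&dev->flush_wait);
		bio->bi_private = &dev->flush_wait;
		submit_bio(bio);
	}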
@@ -4268,7 +4275,7 @@ static void write_dev_flush(struct btrfs_device *device)
*/
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
@@ -4492,12 +4499,11 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
{
bool drop_ref = false;
- spin_lock(&fs_info->fs_roots_radix_lock);
- radix_tree_delete(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid);
- if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ spin_lock(&fs_info->fs_roots_lock);
+ xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
drop_ref = true;
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
@@ -4513,50 +4519,48 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
- u64 root_objectid = 0;
- struct btrfs_root *gang[8];
- int i = 0;
+ struct btrfs_root *roots[8];
+ unsigned long index = 0;
+ int i;
int err = 0;
- unsigned int ret = 0;
+ int grabbed;
while (1) {
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, root_objectid,
- ARRAY_SIZE(gang));
- if (!ret) {
- spin_unlock(&fs_info->fs_roots_radix_lock);
- break;
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_lock);
+ if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
+ spin_unlock(&fs_info->fs_roots_lock);
+ return err;
}
- root_objectid = gang[ret - 1]->root_key.objectid + 1;
- for (i = 0; i < ret; i++) {
- /* Avoid to grab roots in dead_roots */
- if (btrfs_root_refs(&gang[i]->root_item) == 0) {
- gang[i] = NULL;
- continue;
- }
- /* grab all the search result for later use */
- gang[i] = btrfs_grab_root(gang[i]);
+ grabbed = 0;
+ xa_for_each_start(&fs_info->fs_roots, index, root, index) {
+ /* Avoid grabbing roots in dead_roots */
+ if (btrfs_root_refs(&root->root_item) > 0)
+ roots[grabbed++] = btrfs_grab_root(root);
+ if (grabbed >= ARRAY_SIZE(roots))
+ break;
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
- for (i = 0; i < ret; i++) {
- if (!gang[i])
+ for (i = 0; i < grabbed; i++) {
+ if (!roots[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
- err = btrfs_orphan_cleanup(gang[i]);
+ index = roots[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(roots[i]);
if (err)
- break;
- btrfs_put_root(gang[i]);
+ goto out;
+ btrfs_put_root(roots[i]);
}
- root_objectid++;
+ index++;
}
- /* release the uncleaned roots due to error */
- for (; i < ret; i++) {
- if (gang[i])
- btrfs_put_root(gang[i]);
+out:
+ /* Release the roots that remain uncleaned due to error */
+ for (; i < grabbed; i++) {
+ if (roots[i])
+ btrfs_put_root(roots[i]);
}
return err;
}
@@ -4851,13 +4855,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key)
-{
- return btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
-}
-
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
/* cleanup FS via transaction */
@@ -4873,31 +4870,28 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *gang[8];
- u64 root_objectid = 0;
- int ret;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, root_objectid,
- ARRAY_SIZE(gang))) != 0) {
- int i;
+ unsigned long index = 0;
+ int grabbed = 0;
+ struct btrfs_root *roots[8];
- for (i = 0; i < ret; i++)
- gang[i] = btrfs_grab_root(gang[i]);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_lock);
+ while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
+ ULONG_MAX, 8, XA_PRESENT))) {
+ for (int i = 0; i < grabbed; i++)
+ roots[i] = btrfs_grab_root(roots[i]);
+ spin_unlock(&fs_info->fs_roots_lock);
- for (i = 0; i < ret; i++) {
- if (!gang[i])
+ for (int i = 0; i < grabbed; i++) {
+ if (!roots[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
- btrfs_free_log(NULL, gang[i]);
- btrfs_put_root(gang[i]);
+ index = roots[i]->root_key.objectid;
+ btrfs_free_log(NULL, roots[i]);
+ btrfs_put_root(roots[i]);
}
- root_objectid++;
- spin_lock(&fs_info->fs_roots_radix_lock);
+ index++;
+ spin_lock(&fs_info->fs_roots_lock);
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
btrfs_free_log_root_tree(NULL, fs_info);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2e10514ecda8..4ee8c42c9f78 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,8 +87,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
-blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -120,13 +119,12 @@ void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key);
+int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 dio_file_offset,
+ int mirror_num, u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6aa92f84f465..0867c5cd6e01 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -895,7 +895,13 @@ again:
err = -ENOENT;
while (1) {
if (ptr >= end) {
- WARN_ON(ptr > end);
+ if (ptr > end) {
+ err = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ }
break;
}
iref = (struct btrfs_extent_inline_ref *)ptr;
@@ -1239,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (size) {
ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += size;
else if (ret != -EOPNOTSUPP)
@@ -1256,7 +1262,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (bytes_left) {
ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += bytes_left;
}
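
Two block layer API changes meet here: blkdev_issue_discard() lost its flags
argument, and discard support is now probed with bdev_max_discard_sectors()
(zero means the device cannot discard) instead of peeking at queue flags. The
new calling convention, sketched:

	/* A zero max-discard-sectors limit means discard is unsupported. */
	if (!bdev_max_discard_sectors(bdev))
		return 0;

	/* No flags argument anymore; secure erase got its own helper. */
	ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
				   len >> SECTOR_SHIFT, GFP_NOFS);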
@@ -1291,7 +1297,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
&discarded);
discarded += src_disc;
- } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
+ } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
} else {
ret = 0;
@@ -1577,12 +1583,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
- int metadata = !extent_op->is_data;
+ int metadata = 1;
if (TRANS_ABORTED(trans))
return 0;
- if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
metadata = 0;
path = btrfs_alloc_path();
@@ -2180,7 +2186,7 @@ out:
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags,
- int level, int is_data)
+ int level)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
@@ -2192,7 +2198,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
@@ -2357,15 +2362,10 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr, bool strict)
+ u64 bytenr, bool strict, struct btrfs_path *path)
{
- struct btrfs_path *path;
int ret;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
do {
ret = check_committed_ref(root, path, objectid,
offset, bytenr, strict);
@@ -2376,7 +2376,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
} while (ret == -EAGAIN);
out:
- btrfs_free_path(path);
+ btrfs_release_path(path);
if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
@@ -2497,24 +2497,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
return ret;
}
-static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
+static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group *cache;
- u64 bytenr;
-
- spin_lock(&fs_info->block_group_cache_lock);
- bytenr = fs_info->first_logical_byte;
- spin_unlock(&fs_info->block_group_cache_lock);
-
- if (bytenr < (u64)-1)
- return bytenr;
+ struct rb_node *leftmost;
+ u64 bytenr = 0;
- cache = btrfs_lookup_first_block_group(fs_info, search_start);
- if (!cache)
- return 0;
+ read_lock(&fs_info->block_group_cache_lock);
+ /* Get the block group with the lowest logical start address. */
+ leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
+ if (leftmost) {
+ struct btrfs_block_group *bg;
- bytenr = cache->start;
- btrfs_put_block_group(cache);
+ bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
+ bytenr = bg->start;
+ }
+ read_unlock(&fs_info->block_group_cache_lock);
return bytenr;
}
@@ -3803,8 +3800,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
/* Check RO and no space case before trying to activate it */
spin_lock(&block_group->lock);
- if (block_group->ro ||
- block_group->alloc_offset == block_group->zone_capacity) {
+ if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
ret = 1;
/*
* May need to clear fs_info->{treelog,data_reloc}_bg.
@@ -4272,7 +4268,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
return ret;
ffe_ctl->search_start = max(ffe_ctl->search_start,
- first_logical_byte(fs_info, 0));
+ first_logical_byte(fs_info));
ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
@@ -4959,7 +4955,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_key = skinny_metadata ? false : true;
extent_op->update_flags = true;
- extent_op->is_data = false;
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
@@ -5144,7 +5139,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, eb, flag,
- btrfs_header_level(eb), 0);
+ btrfs_header_level(eb));
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -5818,7 +5813,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
btrfs_add_dropped_root(trans, root);
else
btrfs_put_root(root);
@@ -5987,7 +5982,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*trimmed = 0;
/* Discard not supported = nothing to do. */
- if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ if (!bdev_max_discard_sectors(device->bdev))
return 0;
/* Not writable = nothing to do. */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 33c19f51d79b..8f6b544ae616 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
+#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
@@ -28,6 +29,7 @@
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
+#include "compression.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -75,6 +77,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
if (!fs_info->allocated_ebs.next)
return;
+ WARN_ON(!list_empty(&fs_info->allocated_ebs));
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
@@ -135,6 +138,17 @@ struct tree_entry {
struct rb_node rb_node;
};
+/*
+ * Structure to record info about the bio being assembled, plus other info
+ * such as how many bytes remain before the stripe/ordered extent boundary.
+ */
+struct btrfs_bio_ctrl {
+ struct bio *bio;
+ enum btrfs_compression_type compress_type;
+ u32 len_to_stripe_boundary;
+ u32 len_to_oe_boundary;
+};
+
struct extent_page_data {
struct btrfs_bio_ctrl bio_ctrl;
/* tells writepage not to lock the state bits for this range
@@ -164,24 +178,27 @@ static int add_extent_changeset(struct extent_state *state, u32 bits,
return ret;
}
-int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+static void submit_one_bio(struct bio *bio, int mirror_num,
+ enum btrfs_compression_type compress_type)
{
- blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
+
if (is_data_inode(tree->private_data))
- ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
- bio_flags);
+ btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ compress_type);
else
- ret = btrfs_submit_metadata_bio(tree->private_data, bio,
- mirror_num, bio_flags);
-
- return blk_status_to_errno(ret);
+ btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+ /*
+ * The above submission hooks handle any error by ending the bio,
+ * which does the cleanup properly. So we must not return an error
+ * here, or the caller of submit_extent_page() would clean up again
+ * and cause problems.
+ */
}
/* Cleanup unsubmitted bios */
@@ -202,13 +219,12 @@ static void end_write_bio(struct extent_page_data *epd, int ret)
* Return 0 if everything is OK.
* Return <0 for error.
*/
-static int __must_check flush_write_bio(struct extent_page_data *epd)
+static void flush_write_bio(struct extent_page_data *epd)
{
- int ret = 0;
struct bio *bio = epd->bio_ctrl.bio;
if (bio) {
- ret = submit_one_bio(bio, 0, 0);
+ submit_one_bio(bio, 0, 0);
/*
* Clean up of epd->bio is handled by its endio function.
* And endio is either triggered by successful bio execution
@@ -218,7 +234,6 @@ static int __must_check flush_write_bio(struct extent_page_data *epd)
*/
epd->bio_ctrl.bio = NULL;
}
- return ret;
}
int __init extent_state_cache_init(void)
@@ -2303,12 +2318,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num)
{
- struct bio *bio;
struct btrfs_device *dev;
+ struct bio_vec bvec;
+ struct bio bio;
u64 map_length = 0;
u64 sector;
struct btrfs_io_context *bioc = NULL;
- int ret;
+ int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
@@ -2316,8 +2332,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
- bio = btrfs_bio_alloc(1);
- bio->bi_iter.bi_size = 0;
map_length = length;
/*
@@ -2335,52 +2349,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&map_length, &bioc, 0);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
+ if (ret)
+ goto out_counter_dec;
ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
&map_length, &bioc, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
+ if (ret)
+ goto out_counter_dec;
BUG_ON(mirror_num != bioc->mirror_num);
}
sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
- bio->bi_iter.bi_sector = sector;
dev = bioc->stripes[bioc->mirror_num - 1].dev;
btrfs_put_bioc(bioc);
+
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ ret = -EIO;
+ goto out_counter_dec;
}
- bio_set_dev(bio, dev->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
- bio_add_page(bio, page, length, pg_offset);
- if (btrfsic_submit_bio_wait(bio)) {
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = sector;
+ __bio_add_page(&bio, page, length, pg_offset);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ if (ret) {
/* try to remap that extent elsewhere? */
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- return -EIO;
+ goto out_bio_uninit;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start,
rcu_str_deref(dev->name), sector);
+ ret = 0;
+
+out_bio_uninit:
+ bio_uninit(&bio);
+out_counter_dec:
btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return 0;
+ return ret;
}
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
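
repair_io_failure() now builds its write bio entirely on the stack: bio_init()
with a single on-stack bio_vec, submit_bio_wait() for synchronous completion,
and bio_uninit() instead of bio_put(). The pattern in isolation (error
handling trimmed):

	struct bio_vec bvec;
	struct bio bio;
	int ret;

	bio_init(&bio, bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = sector;
	__bio_add_page(&bio, page, length, pg_offset);

	ret = submit_bio_wait(&bio);	/* synchronous, returns an errno */
	bio_uninit(&bio);		/* tear down; on-stack bios are never put */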
@@ -2527,7 +2539,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
failrec->start = start;
failrec->len = sectorsize;
failrec->this_mirror = 0;
- failrec->bio_flags = 0;
+ failrec->compress_type = BTRFS_COMPRESS_NONE;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -2551,8 +2563,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags, em->compress_type);
+ failrec->compress_type = em->compress_type;
}
btrfs_debug(fs_info,
@@ -2684,7 +2695,7 @@ int btrfs_repair_one_sector(struct inode *inode,
* will be handled by the endio on the repair_bio, so we can't return an
* error here.
*/
- submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags);
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type);
return BLK_STS_OK;
}
@@ -2710,18 +2721,19 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_page_set_error(fs_info, page, start, len);
}
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
unlock_page(page);
else
btrfs_subpage_end_reader(fs_info, page, start, len);
}
-static blk_status_t submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- unsigned int error_bitmap,
- submit_bio_hook_t *submit_bio_hook)
+static blk_status_t submit_data_read_repair(struct inode *inode,
+ struct bio *failed_bio,
+ u32 bio_offset, struct page *page,
+ unsigned int pgoff,
+ u64 start, u64 end,
+ int failed_mirror,
+ unsigned int error_bitmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
@@ -2731,6 +2743,9 @@ static blk_status_t submit_read_repair(struct inode *inode,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+ /* This repair is only for data */
+ ASSERT(is_data_inode(inode));
+
/* We're here because we had some read errors or csum mismatch */
ASSERT(error_bitmap);
@@ -2759,7 +2774,7 @@ static blk_status_t submit_read_repair(struct inode *inode,
ret = btrfs_repair_one_sector(inode, failed_bio,
bio_offset + offset,
page, pgoff + offset, start + offset,
- failed_mirror, submit_bio_hook);
+ failed_mirror, btrfs_submit_data_bio);
if (!ret) {
/*
* We have submitted the read repair, the page release
@@ -2943,7 +2958,7 @@ update:
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
ASSERT(PageLocked(page));
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page));
@@ -2951,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
}
/*
- * Find extent buffer for a givne bytenr.
+ * Find extent buffer for a given bytenr.
*
* This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
* in endio context.
@@ -2965,16 +2980,14 @@ static struct extent_buffer *find_extent_buffer_readpage(
* For regular sectorsize, we can use page->private to grab extent
* buffer
*/
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
ASSERT(PagePrivate(page) && page->private);
return (struct extent_buffer *)page->private;
}
- /* For subpage case, we need to lookup buffer radix tree */
- rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- bytenr >> fs_info->sectorsize_bits);
- rcu_read_unlock();
+ /* For the subpage case, we need to look up the extent buffer xarray */
+ eb = xa_load(&fs_info->extent_buffers,
+ bytenr >> fs_info->sectorsize_bits);
ASSERT(eb);
return eb;
}
@@ -3077,13 +3090,13 @@ static void end_bio_extent_readpage(struct bio *bio)
goto readpage_ok;
/*
- * btrfs_submit_read_repair() will handle all the good
+ * submit_data_read_repair() will handle all the good
* and bad sectors, we just continue to the next bvec.
*/
- submit_read_repair(inode, bio, bio_offset, page,
- start - page_offset(page), start,
- end, mirror, error_bitmap,
- btrfs_submit_data_bio);
+ submit_data_read_repair(inode, bio, bio_offset, page,
+ start - page_offset(page),
+ start, end, mirror,
+ error_bitmap);
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
@@ -3132,6 +3145,42 @@ readpage_ok:
bio_put(bio);
}
+/**
+ * Populate every free slot in a provided array with pages.
+ *
+ * @nr_pages: number of pages to allocate
+ * @page_array: the array to fill with pages; any existing non-null entries in
+ * the array will be skipped
+ *
+ * Return: 0 if all pages were able to be allocated;
+ * -ENOMEM otherwise, and the caller is responsible for freeing all
+ * non-null page pointers in the array.
+ */
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+{
+ unsigned int allocated;
+
+ for (allocated = 0; allocated < nr_pages;) {
+ unsigned int last = allocated;
+
+ allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+
+ if (allocated == nr_pages)
+ return 0;
+
+ /*
+ * No progress was made during this iteration, even though
+ * alloc_pages_bulk_array() falls back to alloc_page() when it
+ * cannot bulk-allocate. So we must be out of memory.
+ */
+ if (allocated == last)
+ return -ENOMEM;
+
+ memalloc_retry_wait(GFP_NOFS);
+ }
+ return 0;
+}
+
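
A typical caller of the helper above zeroes the array first (non-NULL slots
are skipped) and, per the kernel-doc, frees whatever did get allocated when
the call fails. A sketch with a hypothetical fixed count:

	struct page *pages[16] = { NULL };
	unsigned int i;
	int ret;

	ret = btrfs_alloc_page_array(ARRAY_SIZE(pages), pages);
	if (ret) {
		/* On -ENOMEM some slots may still hold pages; free them all. */
		for (i = 0; i < ARRAY_SIZE(pages); i++)
			if (pages[i])
				__free_page(pages[i]);
		return ret;
	}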
/*
* Initialize the members up to but not including 'bio'. Use after allocating a
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
@@ -3157,13 +3206,13 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
return bio;
}
-struct bio *btrfs_bio_clone(struct bio *bio)
+struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
{
struct btrfs_bio *bbio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset);
+ new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(new);
btrfs_bio_init(bbio);
bbio->iter = bio->bi_iter;
@@ -3198,7 +3247,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
* a contiguous page to the previous one
* @size: portion of page that we want to write
* @pg_offset: starting offset in the page
- * @bio_flags: flags of the current bio to see if we can merge them
+ * @compress_type: compression type of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
*
@@ -3210,7 +3259,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page,
u64 disk_bytenr, unsigned int size,
unsigned int pg_offset,
- unsigned long bio_flags)
+ enum btrfs_compression_type compress_type)
{
struct bio *bio = bio_ctrl->bio;
u32 bio_size = bio->bi_iter.bi_size;
@@ -3222,10 +3271,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
ASSERT(bio);
/* The limit should be calculated when bio_ctrl->bio is allocated */
ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
- if (bio_ctrl->bio_flags != bio_flags)
+ if (bio_ctrl->compress_type != compress_type)
return 0;
- if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
@@ -3268,7 +3317,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
* The split happens for real compressed bio, which happens in
* btrfs_submit_compressed_read/write().
*/
- if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
bio_ctrl->len_to_stripe_boundary = U32_MAX;
return 0;
@@ -3311,7 +3360,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
unsigned int opf,
bio_end_io_t end_io_func,
u64 disk_bytenr, u32 offset, u64 file_offset,
- unsigned long bio_flags)
+ enum btrfs_compression_type compress_type)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
@@ -3322,12 +3371,12 @@ static int alloc_new_bio(struct btrfs_inode *inode,
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
- if (bio_flags & EXTENT_BIO_COMPRESSED)
+ if (compress_type != BTRFS_COMPRESS_NONE)
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
- bio_ctrl->bio_flags = bio_flags;
+ bio_ctrl->compress_type = compress_type;
bio->bi_end_io = end_io_func;
bio->bi_private = &inode->io_tree;
bio->bi_opf = opf;
@@ -3386,7 +3435,7 @@ error:
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
- * @bio_flags: flags of the current bio to see if we can merge them
+ * @compress_type: compression type of the current bio
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
@@ -3395,7 +3444,7 @@ static int submit_extent_page(unsigned int opf,
size_t size, unsigned long pg_offset,
bio_end_io_t end_io_func,
int mirror_num,
- unsigned long bio_flags,
+ enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
int ret = 0;
@@ -3407,10 +3456,8 @@ static int submit_extent_page(unsigned int opf,
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
if (force_bio_submit && bio_ctrl->bio) {
- ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
+ submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
bio_ctrl->bio = NULL;
- if (ret < 0)
- return ret;
}
while (cur < pg_offset + size) {
@@ -3422,7 +3469,7 @@ static int submit_extent_page(unsigned int opf,
ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
end_io_func, disk_bytenr, offset,
page_offset(page) + cur,
- bio_flags);
+ compress_type);
if (ret < 0)
return ret;
}
@@ -3430,14 +3477,14 @@ static int submit_extent_page(unsigned int opf,
* We must go through btrfs_bio_add_page() to ensure each
* page range won't cross various boundaries.
*/
- if (bio_flags & EXTENT_BIO_COMPRESSED)
+ if (compress_type != BTRFS_COMPRESS_NONE)
added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
size - offset, pg_offset + offset,
- bio_flags);
+ compress_type);
else
added = btrfs_bio_add_page(bio_ctrl, page,
disk_bytenr + offset, size - offset,
- pg_offset + offset, bio_flags);
+ pg_offset + offset, compress_type);
/* Metadata page range should never be split */
if (!is_data_inode(&inode->vfs_inode))
@@ -3451,11 +3498,8 @@ static int submit_extent_page(unsigned int opf,
if (added < size - offset) {
/* The bio should contain some page(s) */
ASSERT(bio_ctrl->bio->bi_iter.bi_size);
- ret = submit_one_bio(bio_ctrl->bio, mirror_num,
- bio_ctrl->bio_flags);
+ submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
bio_ctrl->bio = NULL;
- if (ret < 0)
- return ret;
}
cur += added;
}
@@ -3478,7 +3522,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
if (page->mapping)
lockdep_assert_held(&page->mapping->private_lock);
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
if (!PagePrivate(page))
attach_page_private(page, eb);
else
@@ -3513,7 +3557,7 @@ int set_page_extent_mapped(struct page *page)
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (btrfs_is_subpage(fs_info, page))
return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
@@ -3530,7 +3574,7 @@ void clear_page_extent_mapped(struct page *page)
return;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (btrfs_is_subpage(fs_info, page))
return btrfs_detach_subpage(fs_info, page);
detach_page_private(page);
@@ -3569,7 +3613,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start)
{
@@ -3638,16 +3682,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- this_bio_flag |= EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&this_bio_flag,
- em->compress_type);
- }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = em->compress_type;
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED)
+ if (this_bio_flag != BTRFS_COMPRESS_NONE)
disk_bytenr = em->block_start;
else
disk_bytenr = em->block_start + extent_offset;
@@ -3743,8 +3784,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
this_bio_flag,
force_bio_submit);
if (ret) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, false, cur, iosize);
+ /*
+ * We have to unlock the remaining range, or the page
+ * will never be unlocked.
+ */
+ unlock_extent(tree, cur, end);
+ end_page_read(page, false, cur, end + 1 - cur);
goto out;
}
cur = cur + iosize;
@@ -3754,6 +3799,27 @@ out:
return ret;
}
+int btrfs_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ /*
+ * Submit any bio that was assembled: on success it completes the
+ * read, and if btrfs_do_readpage() failed it does the cleanup.
+ */
+ if (bio_ctrl.bio)
+ submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
+ return ret;
+}
+
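
btrfs_read_folio() is the replacement for the old readpage entry point, part
of the kernel-wide ->readpage to ->read_folio conversion. It gets wired into
the data address space operations roughly like so (the full btrfs_aops table
lives in inode.c):

	static const struct address_space_operations my_aops = {
		.read_folio	= btrfs_read_folio,
		/* ... writepages, readahead, release_folio, etc. ... */
	};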
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
@@ -3772,12 +3838,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
}
}
-static void update_nr_written(struct writeback_control *wbc,
- unsigned long nr_written)
-{
- wbc->nr_to_write -= nr_written;
-}
-
/*
* helper for __extent_writepage, doing all of the delayed allocation setup.
*
@@ -3877,7 +3937,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (!btrfs_is_subpage(fs_info, page)) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
@@ -3920,10 +3980,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
u64 extent_offset;
u64 block_start;
struct extent_map *em;
+ int saved_ret = 0;
int ret = 0;
int nr = 0;
u32 opf = REQ_OP_WRITE;
const unsigned int write_flags = wbc_to_write_flags(wbc);
+ bool has_error = false;
bool compressed;
ret = btrfs_writepage_cow_fixup(page);
@@ -3938,7 +4000,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
while (cur <= end) {
u64 disk_bytenr;
@@ -3973,6 +4035,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
break;
}
@@ -4036,6 +4101,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
end_bio_extent_writepage,
0, 0, false);
if (ret) {
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
+
btrfs_page_set_error(fs_info, page, cur, iosize);
if (PageWriteback(page))
btrfs_page_clear_writeback(fs_info, page, cur,
@@ -4049,8 +4118,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* If we finish without problem, we should not only clear page dirty,
* but also empty subpage dirty bits
*/
- if (!ret)
+ if (!has_error)
btrfs_page_assert_not_dirty(fs_info, page);
+ else
+ ret = saved_ret;
*nr_ret = nr;
return ret;
}
@@ -4181,9 +4252,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
- if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
- btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
-
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4203,14 +4271,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages, failed_page_nr;
+ int i, num_pages;
int flush = 0;
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ flush_write_bio(epd);
flush = 1;
btrfs_tree_lock(eb);
}
@@ -4220,9 +4286,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!epd->sync_io)
return 0;
if (!flush) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ flush_write_bio(epd);
flush = 1;
}
while (1) {
@@ -4260,7 +4324,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
* Subpage metadata doesn't use page locking at all, so we can skip
* the page locking.
*/
- if (!ret || fs_info->sectorsize < PAGE_SIZE)
+ if (!ret || fs_info->nodesize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb);
@@ -4269,14 +4333,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- int err;
-
- err = flush_write_bio(epd);
- if (err < 0) {
- ret = err;
- failed_page_nr = i;
- goto err_unlock;
- }
+ flush_write_bio(epd);
flush = 1;
}
lock_page(p);
@@ -4284,25 +4341,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
}
return ret;
-err_unlock:
- /* Unlock already locked pages */
- for (i = 0; i < failed_page_nr; i++)
- unlock_page(eb->pages[i]);
- /*
- * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
- * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
- * be made and undo everything done before.
- */
- btrfs_tree_lock(eb);
- spin_lock(&eb->refs_lock);
- set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- end_extent_buffer_writeback(eb);
- spin_unlock(&eb->refs_lock);
- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
- fs_info->dirty_metadata_batch);
- btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- btrfs_tree_unlock(eb);
- return ret;
}
static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
@@ -4397,8 +4435,8 @@ static struct extent_buffer *find_extent_buffer_nolock(
struct extent_buffer *eb;
rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits);
+ eb = xa_load(&fs_info->extent_buffers,
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
return eb;
@@ -4420,7 +4458,7 @@ static void end_bio_subpage_eb_writepage(struct bio *bio)
struct bvec_iter_all iter_all;
fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
- ASSERT(fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(fs_info->nodesize < PAGE_SIZE);
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
@@ -4572,7 +4610,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
* dirty anymore, we have submitted a page. Update nr_written in wbc.
*/
if (no_dirty_ebs)
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
return ret;
}
@@ -4608,7 +4646,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
break;
}
disk_bytenr += PAGE_SIZE;
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
unlock_page(p);
}
@@ -4747,7 +4785,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!PagePrivate(page))
return 0;
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return submit_eb_subpage(page, wbc, epd);
spin_lock(&mapping->private_lock);
@@ -4803,8 +4841,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
/*
* Implies write in zoned mode. Mark the last eb in a block group.
*/
- if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
- set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
ret = write_one_eb(eb, wbc, epd);
@@ -4923,13 +4960,19 @@ retry:
 * if the fs already has an error.
*/
if (!BTRFS_FS_ERROR(fs_info)) {
- ret = flush_write_bio(&epd);
+ flush_write_bio(&epd);
} else {
ret = -EROFS;
end_write_bio(&epd, ret);
}
out:
btrfs_zoned_meta_io_unlock(fs_info);
+ /*
+ * We can get ret > 0 from submit_extent_page() indicating how many ebs
+ * were submitted. Reset it to 0 to avoid false alerts for the caller.
+ */
+ if (ret > 0)
+ ret = 0;
return ret;
}
@@ -5031,8 +5074,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
+ flush_write_bio(epd);
lock_page(page);
}
@@ -5042,10 +5084,8 @@ retry:
}
if (wbc->sync_mode != WB_SYNC_NONE) {
- if (PageWriteback(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
- }
+ if (PageWriteback(page))
+ flush_write_bio(epd);
wait_on_page_writeback(page);
}
@@ -5085,9 +5125,8 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- ret = flush_write_bio(epd);
- if (!ret)
- goto retry;
+ flush_write_bio(epd);
+ goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
@@ -5113,8 +5152,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
return ret;
}
- ret = flush_write_bio(&epd);
- ASSERT(ret <= 0);
+ flush_write_bio(&epd);
return ret;
}
@@ -5176,7 +5214,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
}
if (!found_error)
- ret = flush_write_bio(&epd);
+ flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
@@ -5209,7 +5247,7 @@ int extent_writepages(struct address_space *mapping,
end_write_bio(&epd, ret);
return ret;
}
- ret = flush_write_bio(&epd);
+ flush_write_bio(&epd);
return ret;
}
@@ -5232,10 +5270,8 @@ void extent_readahead(struct readahead_control *rac)
if (em_cached)
free_extent_map(em_cached);
- if (bio_ctrl.bio) {
- if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
- return;
- }
+ if (bio_ctrl.bio)
+ submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
}
/*
@@ -5271,7 +5307,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
}
/*
- * a helper for releasepage, this tests for areas of the page that
+ * a helper for release_folio, this tests for areas of the page that
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
@@ -5307,7 +5343,7 @@ static int try_release_extent_state(struct extent_io_tree *tree,
}
/*
- * a helper for releasepage. As long as there are no locked extents
+ * a helper for release_folio. As long as there are no locked extents
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
@@ -5804,7 +5840,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
return;
}
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
 * removed the eb from the extent_buffers xarray, so we could race
@@ -5911,9 +5947,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
int i;
- struct page *p;
struct extent_buffer *new;
int num_pages = num_extent_pages(src);
+ int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
@@ -5926,22 +5962,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+ memset(new->pages, 0, sizeof(*new->pages) * num_pages);
+ ret = btrfs_alloc_page_array(num_pages, new->pages);
+ if (ret) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+
for (i = 0; i < num_pages; i++) {
int ret;
+ struct page *p = new->pages[i];
- p = alloc_page(GFP_NOFS);
- if (!p) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
ret = attach_extent_buffer_page(new, p, NULL);
if (ret < 0) {
- put_page(p);
btrfs_release_extent_buffer(new);
return NULL;
}
WARN_ON(PageDirty(p));
- new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
set_extent_buffer_uptodate(new);
@@ -5955,31 +5992,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
int num_pages;
int i;
+ int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
num_pages = num_extent_pages(eb);
+ ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ if (ret)
+ goto err;
+
for (i = 0; i < num_pages; i++) {
- int ret;
+ struct page *p = eb->pages[i];
- eb->pages[i] = alloc_page(GFP_NOFS);
- if (!eb->pages[i])
- goto err;
- ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
+ ret = attach_extent_buffer_page(eb, p, NULL);
if (ret < 0)
goto err;
}
+
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
err:
- for (; i > 0; i--) {
- detach_extent_buffer_page(eb, eb->pages[i - 1]);
- __free_page(eb->pages[i - 1]);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i]) {
+ detach_extent_buffer_page(eb, eb->pages[i]);
+ __free_page(eb->pages[i]);
+ }
}
__free_extent_buffer(eb);
return NULL;
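/*
 * Illustrative sketch: the err: loop above must tolerate NULL slots
 * because btrfs_alloc_page_array() (prototype added in the extent_io.h
 * hunk below) can fail after filling only part of the array. A
 * simplified single-page stand-in for the helper and its matching
 * cleanup, for illustration only — the real helper presumably uses a
 * more efficient bulk allocator:
 */
#include <linux/gfp.h>
#include <linux/mm.h>

static int demo_alloc_page_array(unsigned int nr_pages, struct page **pages)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(GFP_NOFS);
		if (!pages[i])
			return -ENOMEM;	/* earlier slots stay allocated */
	}
	return 0;
}

static void demo_free_page_array(unsigned int nr_pages, struct page **pages)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		if (pages[i])	/* skip the slots a failed allocation left NULL */
			__free_page(pages[i]);
	}
}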
@@ -6001,10 +6043,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
*
* It is only cleared in two cases: freeing the last non-tree
* reference to the extent_buffer when its STALE bit is set or
- * calling releasepage when the tree reference is the only reference.
+ * calling release_folio when the tree reference is the only reference.
*
* In both cases, care is taken to ensure that the extent_buffer's
- * pages are not under io. However, releasepage can be concurrently
+ * pages are not under io. However, release_folio can be concurrently
* called with creating new references, which is prone to race
* conditions between the calls to check_buffer_tree_ref in those
* codepaths and clearing TREE_REF in try_release_extent_buffer.
@@ -6086,24 +6128,22 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
if (!eb)
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
-again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
+
+ do {
+ ret = xa_insert(&fs_info->extent_buffers,
+ start >> fs_info->sectorsize_bits,
+ eb, GFP_NOFS);
+ if (ret == -ENOMEM) {
+ exists = ERR_PTR(ret);
goto free_eb;
- else
- goto again;
- }
+ }
+ if (ret == -EBUSY) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ }
+ } while (ret);
+
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
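/*
 * Illustrative sketch: the loop above replaces radix_tree_preload() +
 * radix_tree_insert() with xa_insert(), which returns -EBUSY when the
 * index is already occupied and -ENOMEM on allocation failure. The
 * insert-or-lookup idiom in isolation (demo_ names are assumptions; in
 * the patch, find_extent_buffer() also grabs a reference on the
 * existing entry):
 */
#include <linux/err.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_xa);

static void *demo_insert_or_find(unsigned long index, void *new)
{
	int ret;

	do {
		ret = xa_insert(&demo_xa, index, new, GFP_NOFS);
		if (ret == -ENOMEM)
			return ERR_PTR(-ENOMEM);
		if (ret == -EBUSY) {
			/* Lost the race: reuse the winner's entry. */
			void *exists = xa_load(&demo_xa, index);

			if (exists)
				return exists;
			/* The entry vanished again, retry the insert. */
		}
	} while (ret);

	return new;	/* inserted */
}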
@@ -6124,7 +6164,7 @@ static struct extent_buffer *grab_extent_buffer(
* don't try to insert two ebs for the same bytenr. So here we always
* return NULL and just continue.
*/
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (fs_info->nodesize < PAGE_SIZE)
return NULL;
/* Page not yet attached to an extent buffer */
@@ -6146,6 +6186,30 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
}
+static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+{
+ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ btrfs_err(fs_info, "bad tree block start %llu", start);
+ return -EINVAL;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE &&
+ offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ if (fs_info->nodesize >= PAGE_SIZE &&
+ !IS_ALIGNED(start, PAGE_SIZE)) {
+ btrfs_err(fs_info,
+ "tree block is not page aligned, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ return 0;
+}
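/*
 * Worked example for the checks above, with hypothetical geometry:
 * sectorsize 4K, nodesize 16K, 64K pages (the subpage case). start =
 * 0x1c000 passes, as it is 4K aligned and offset_in_page() gives 48K,
 * with 48K + 16K = 64K not exceeding PAGE_SIZE. start = 0x1d000 fails
 * the cross-page check, since 52K + 16K = 68K > 64K. With nodesize >=
 * PAGE_SIZE, only page-aligned starts are accepted.
 */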
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
@@ -6160,10 +6224,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int uptodate = 1;
int ret;
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
- btrfs_err(fs_info, "bad tree block start %llu", start);
+ if (check_eb_alignment(fs_info, start))
return ERR_PTR(-EINVAL);
- }
#if BITS_PER_LONG == 32
if (start >= MAX_LFS_FILESIZE) {
@@ -6176,14 +6238,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_warn_32bit_limit(fs_info);
#endif
- if (fs_info->sectorsize < PAGE_SIZE &&
- offset_in_page(start) + len > PAGE_SIZE) {
- btrfs_err(fs_info,
- "tree block crosses page boundary, start %llu nodesize %lu",
- start, len);
- return ERR_PTR(-EINVAL);
- }
-
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
@@ -6213,7 +6267,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
@@ -6257,39 +6311,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
/*
* We can't unlock the pages just yet since the extent buffer
 * hasn't been properly inserted into the extent_buffers xarray; this
- * opens a race with btree_releasepage which can free a page
+ * opens a race with btree_release_folio which can free a page
* while we are still filling in all pages for the buffer and
* we could crash.
*/
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
-
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
+
+ do {
+ ret = xa_insert(&fs_info->extent_buffers,
+ start >> fs_info->sectorsize_bits,
+ eb, GFP_NOFS);
+ if (ret == -ENOMEM) {
+ exists = ERR_PTR(ret);
goto free_eb;
- else
- goto again;
- }
+ }
+ if (ret == -EBUSY) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ }
+ } while (ret);
+
/* add one reference for the tree */
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
/*
* Now it's safe to unlock the pages because any calls to
- * btree_releasepage will correctly detect that a page belongs to a
+ * btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
for (i = 0; i < num_pages; i++)
@@ -6327,10 +6378,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
- spin_lock(&fs_info->buffer_lock);
- radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> fs_info->sectorsize_bits);
- spin_unlock(&fs_info->buffer_lock);
+ xa_erase(&fs_info->extent_buffers,
+ eb->start >> fs_info->sectorsize_bits);
} else {
spin_unlock(&eb->refs_lock);
}
@@ -6432,7 +6481,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb)
int num_pages;
struct page *page;
- if (eb->fs_info->sectorsize < PAGE_SIZE)
+ if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
num_pages = num_extent_pages(eb);
@@ -6464,7 +6513,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
+ bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
/*
* For subpage case, we can have other extent buffers in the
@@ -6504,9 +6553,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- if (page)
- btrfs_page_clear_uptodate(fs_info, page,
- eb->start, eb->len);
+ if (!page)
+ continue;
+
+ /*
+ * This is special handling for metadata subpage, as regular
+	 * btrfs_is_subpage() cannot handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ ClearPageUptodate(page);
+ else
+ btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
@@ -6521,7 +6579,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
+
+ /*
+ * This is special handling for metadata subpage, as regular
+	 * btrfs_is_subpage() cannot handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ SetPageUptodate(page);
+ else
+ btrfs_subpage_set_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
@@ -6577,12 +6644,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
atomic_dec(&eb->io_pages);
}
if (bio_ctrl.bio) {
- int tmp;
-
- tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
+ submit_one_bio(bio_ctrl.bio, mirror_num, 0);
bio_ctrl.bio = NULL;
- if (tmp < 0)
- return tmp;
}
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -6616,7 +6679,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
return -EIO;
- if (eb->fs_info->sectorsize < PAGE_SIZE)
+ if (eb->fs_info->nodesize < PAGE_SIZE)
return read_extent_buffer_subpage(eb, wait, mirror_num);
num_pages = num_extent_pages(eb);
@@ -6659,7 +6722,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
/*
- * It is possible for releasepage to clear the TREE_REF bit before we
+ * It is possible for release_folio to clear the TREE_REF bit before we
* set io_pages. See check_buffer_tree_ref for a more detailed comment.
*/
check_buffer_tree_ref(eb);
@@ -6695,10 +6758,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
if (bio_ctrl.bio) {
- err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
+ submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type);
bio_ctrl.bio = NULL;
- if (err)
- return err;
}
if (ret || wait != WAIT_COMPLETE)
@@ -6871,7 +6932,7 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
* would have !PageUptodate && !PageError, as we clear PageError before
* reading.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->nodesize < PAGE_SIZE) {
bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
@@ -6973,7 +7034,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
ASSERT(dst->len == src->len);
- if (dst->fs_info->sectorsize == PAGE_SIZE) {
+ if (dst->fs_info->nodesize >= PAGE_SIZE) {
num_pages = num_extent_pages(dst);
for (i = 0; i < num_pages; i++)
copy_page(page_address(dst->pages[i]),
@@ -6982,7 +7043,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
size_t src_offset = get_eb_offset_in_page(src, 0);
size_t dst_offset = get_eb_offset_in_page(dst, 0);
- ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(src->fs_info->nodesize < PAGE_SIZE);
memcpy(page_address(dst->pages[0]) + dst_offset,
page_address(src->pages[0]) + src_offset,
src->len);
@@ -7263,42 +7324,25 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
-#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *gang[GANG_LOOKUP_SIZE];
- struct extent_buffer *found = NULL;
+ struct extent_buffer *eb;
+ unsigned long index;
u64 page_start = page_offset(page);
- u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
lockdep_assert_held(&fs_info->buffer_lock);
- while (cur < page_start + PAGE_SIZE) {
- int ret;
- int i;
-
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
- (void **)gang, cur >> fs_info->sectorsize_bits,
- min_t(unsigned int, GANG_LOOKUP_SIZE,
- PAGE_SIZE / fs_info->nodesize));
- if (ret == 0)
- goto out;
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
- goto out;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- goto out;
- }
- }
- cur = gang[ret - 1]->start + gang[ret - 1]->len;
+ xa_for_each_start(&fs_info->extent_buffers, index, eb,
+ page_start >> fs_info->sectorsize_bits) {
+ if (in_range(eb->start, page_start, PAGE_SIZE))
+ return eb;
+ else if (eb->start >= page_start + PAGE_SIZE)
+ /* Already beyond page end */
+ return NULL;
}
-out:
- return found;
+ return NULL;
}
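/*
 * Illustrative sketch: xa_for_each_start() visits present entries in
 * ascending index order starting at the given index, which is what
 * makes the early NULL return above correct — once an entry starts at
 * or past the page end, no later entry can fall inside the page.
 * Minimal usage, with demo_ names as assumptions:
 */
static void *demo_first_in_window(struct xarray *xa, unsigned long lo,
				  unsigned long hi)
{
	unsigned long index;
	void *entry;

	xa_for_each_start(xa, index, entry, lo) {
		if (index > hi)
			return NULL;	/* sorted iteration: past the window */
		return entry;		/* first entry with index >= lo */
	}
	return NULL;			/* nothing at or after lo */
}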
static int try_release_subpage_extent_buffer(struct page *page)
@@ -7375,7 +7419,7 @@ int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 151e9da5da2d..23d4103c8831 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,15 +7,9 @@
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
+#include "compression.h"
#include "ulist.h"
-/*
- * flags for bio submission. The high bits indicate the compression
- * type for this bio
- */
-#define EXTENT_BIO_COMPRESSED 1
-#define EXTENT_BIO_FLAG_SHIFT 16
-
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
@@ -32,7 +26,6 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
- EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@@ -71,9 +64,9 @@ struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
-typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
- unsigned long bio_flags);
+ enum btrfs_compression_type compress_type);
typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
struct bio *bio, u64 dio_file_offset);
@@ -103,17 +96,6 @@ struct extent_buffer {
};
/*
- * Structure to record info about the bio being assembled, and other info like
- * how many bytes are there before stripe/ordered extent boundary.
- */
-struct btrfs_bio_ctrl {
- struct bio *bio;
- unsigned long bio_flags;
- u32 len_to_stripe_boundary;
- u32 len_to_oe_boundary;
-};
-
-/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
@@ -158,17 +140,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-static inline void extent_set_compress_type(unsigned long *bio_flags,
- int compress_type)
-{
- *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
-}
-
-static inline int extent_compress_type(unsigned long bio_flags)
-{
- return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
-}
-
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
@@ -178,11 +149,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags);
-int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl,
- unsigned int read_flags, u64 *prev_em_start);
+int btrfs_read_folio(struct file *file, struct folio *folio);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
@@ -277,8 +244,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
+
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct bio *bio);
+struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
@@ -297,7 +266,7 @@ struct io_failure_record {
u64 start;
u64 len;
u64 logical;
- unsigned long bio_flags;
+ enum btrfs_compression_type compress_type;
int this_mirror;
int failed_mirror;
};
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 380054c94e4b..1fd827b99c1b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1307,11 +1307,12 @@ static int prepare_uptodate_page(struct inode *inode,
struct page *page, u64 pos,
bool force_uptodate)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
!PageUptodate(page)) {
- ret = btrfs_readpage(NULL, page);
+ ret = btrfs_read_folio(NULL, folio);
if (ret)
return ret;
lock_page(page);
@@ -1321,8 +1322,8 @@ static int prepare_uptodate_page(struct inode *inode,
}
/*
- * Since btrfs_readpage() will unlock the page before it
- * returns, there is a window where btrfs_releasepage() can be
+ * Since btrfs_read_folio() will unlock the folio before it
+ * returns, there is a window where btrfs_release_folio() can be
* called to release the page. Here we check both inode
* mapping and PagePrivate() to make sure the page was not
* released.
@@ -1460,8 +1461,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
-static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes, bool nowait)
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset.
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range.
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * > 0 If we can nocow, and updates @write_bytes.
+ * 0 If we can't do a nocow write.
+ * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
+ * root is in progress.
+ * < 0 If an error happened.
+ *
+ * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1472,7 +1492,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
- if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
@@ -1480,71 +1500,21 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
fs_info->sectorsize) - 1;
num_bytes = lockend - lockstart + 1;
- if (nowait) {
- struct btrfs_ordered_extent *ordered;
-
- if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
- return -EAGAIN;
-
- ordered = btrfs_lookup_ordered_range(inode, lockstart,
- num_bytes);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- ret = -EAGAIN;
- goto out_unlock;
- }
- } else {
- btrfs_lock_and_flush_ordered_range(inode, lockstart,
- lockend, NULL);
- }
-
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
- if (!nowait)
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
-out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
}
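/*
 * Sketch of the calling convention documented above (a hypothetical
 * caller, not from the patch): a positive return means *write_bytes was
 * clamped to the NOCOW-able length and btrfs_check_nocow_unlock() must
 * be called once the write is done.
 */
static ssize_t demo_try_nocow_write(struct btrfs_inode *inode, loff_t pos,
				    size_t len)
{
	size_t write_bytes = len;
	int ret;

	ret = btrfs_check_nocow_lock(inode, pos, &write_bytes);
	if (ret <= 0)
		return ret;	/* 0: fall back to COW; -EAGAIN: snapshot in progress */

	/* ... write up to write_bytes bytes in place, without COW ... */

	btrfs_check_nocow_unlock(inode);
	return write_bytes;
}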
-static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
-{
- return check_can_nocow(inode, pos, write_bytes, true);
-}
-
-/*
- * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
- *
- * @pos: File offset
- * @write_bytes: The length to write, will be updated to the nocow writeable
- * range
- *
- * This function will flush ordered extents in the range to ensure proper
- * nocow checks.
- *
- * Return:
- * >0 and update @write_bytes if we can do nocow write
- * 0 if we can't do nocow write
- * -EAGAIN if we can't get the needed lock or there are ordered extents
- * for * (nowait == true) case
- * <0 if other error happened
- *
- * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
- */
-int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
-{
- return check_can_nocow(inode, pos, write_bytes, false);
-}
-
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
@@ -1579,20 +1549,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
loff_t oldsize;
loff_t start_pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- size_t nocow_bytes = count;
-
- /* We will allocate space in case nodatacow is not set, so bail */
- if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
- return -EAGAIN;
- /*
- * There are holes in the range or parts of the range that must
- * be COWed (shared extents, RO block groups, etc), so just bail
- * out.
- */
- if (nocow_bytes < count)
- return -EAGAIN;
- }
+ /*
+ * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
+ * prealloc flags, as without those flags we always have to COW. We will
+ * later check if we can really COW into the target range (using
+ * can_nocow_extent() at btrfs_get_blocks_direct_write()).
+ */
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return -EAGAIN;
current->backing_dev_info = inode_to_bdi(inode);
ret = file_remove_privs(file);
@@ -1720,7 +1685,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes,
- reserve_bytes);
+ reserve_bytes, false);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -1965,8 +1930,7 @@ relock:
*/
again:
from->nofault = true;
- err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, written);
+ err = btrfs_dio_rw(iocb, from, written);
from->nofault = false;
/* No increment (+=) because iomap returns a cumulative value. */
@@ -2401,7 +2365,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct address_space *mapping = filp->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(filp);
@@ -2570,10 +2534,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
return ret;
}
-static int btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart,
+ const u64 lockend,
+ struct extent_state **cached_state)
{
/*
* For subpage case, if the range is not at page boundary, we could
@@ -2587,40 +2551,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
while (1) {
- struct btrfs_ordered_extent *ordered;
- int ret;
-
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
- lockend);
-
/*
- * We need to make sure we have no ordered extents in this range
- * and nobody raced in and read a page in this range, if we did
- * we need to try again.
+	 * We can't have ordered extents in the range, nor dirty/writeback
+	 * pages, because we hold the inode's VFS lock in exclusive mode,
+	 * we hold the inode's i_mmap_lock in exclusive mode, we have
+	 * flushed all delalloc in the range, and we have waited for any
+	 * ordered extents in the range to complete.
+ * We can race with anyone reading pages from this range, so after
+ * locking the range check if we have pages in the range, and if
+ * we do, unlock the range and retry.
*/
- if ((!ordered ||
- (ordered->file_offset + ordered->num_bytes <= lockstart ||
- ordered->file_offset > lockend)) &&
- !filemap_range_has_page(inode->i_mapping,
- page_lockstart, page_lockend)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
+ if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
+ page_lockend))
break;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
+
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, cached_state);
- ret = btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (ret)
- return ret;
}
- return 0;
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
}
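/*
 * Worked example for the subpage page rounding above (hypothetical
 * numbers, 64K pages, assuming page_lockstart is the matching
 * round_up() of lockstart): punching [72K, 200K) gives page_lockstart =
 * 128K and page_lockend = 192K - 1, so filemap_range_has_page() only
 * considers the one page lying entirely inside the range. The partial
 * pages at either end also cover data outside the hole, may
 * legitimately stay in the page cache, and must not trigger an endless
 * retry.
 */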
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
@@ -2976,11 +2929,12 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
- return ret;
+ goto out_only_mutex;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
@@ -3072,10 +3026,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
goto out_only_mutex;
}
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
- goto out_only_mutex;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
path = btrfs_alloc_path();
if (!path) {
@@ -3237,8 +3188,6 @@ static int btrfs_zero_range(struct inode *inode,
u64 bytes_to_reserve = 0;
bool space_reserved = false;
- inode_dio_wait(inode);
-
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
@@ -3368,10 +3317,8 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
- goto out;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
@@ -3417,6 +3364,9 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 alloc_hint = 0;
u64 locked_end;
u64 actual_end = 0;
+ u64 data_space_needed = 0;
+ u64 data_space_reserved = 0;
+ u64 qgroup_reserved = 0;
struct extent_map *em;
int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
@@ -3437,18 +3387,6 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(file, offset, len);
- /*
- * Only trigger disk allocation, don't trigger qgroup reserve
- *
- * For qgroup space, it will be checked later.
- */
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
- alloc_end - alloc_start);
- if (ret < 0)
- return ret;
- }
-
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
@@ -3485,8 +3423,12 @@ static long btrfs_fallocate(struct file *file, int mode,
}
/*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
+ * We have locked the inode at the VFS level (in exclusive mode) and we
+ * have locked the i_mmap_lock lock (in exclusive mode). Now before
+	 * locking the file range, flush all delalloc in the range and wait for
+ * all ordered extents in the range to complete. After this we can lock
+ * the file range and, due to the previous locking we did, we know there
+ * can't be more delalloc or ordered extents in the range.
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
@@ -3500,38 +3442,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
- while (1) {
- struct btrfs_ordered_extent *ordered;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
- /* the extent lock is ordered inside the running
- * transaction
- */
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
- locked_end);
-
- if (ordered &&
- ordered->file_offset + ordered->num_bytes > alloc_start &&
- ordered->file_offset < alloc_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end,
- &cached_state);
- /*
- * we can't wait on the range with the transaction
- * running or with the extent lock held
- */
- ret = btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
- if (ret)
- goto out;
- } else {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- }
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
@@ -3548,48 +3462,64 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = add_falloc_range(&reserve_list, cur_offset,
- last_byte - cur_offset);
+ const u64 range_len = last_byte - cur_offset;
+
+ ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
- &data_reserved, cur_offset,
- last_byte - cur_offset);
+ &data_reserved, cur_offset, range_len);
if (ret < 0) {
- cur_offset = last_byte;
free_extent_map(em);
break;
}
- } else {
- /*
- * Do not need to reserve unwritten extent for this
- * range, free reserved data space first, otherwise
- * it'll result in false ENOSPC error.
- */
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, cur_offset,
- last_byte - cur_offset);
+ qgroup_reserved += range_len;
+ data_space_needed += range_len;
}
free_extent_map(em);
cur_offset = last_byte;
}
+ if (!ret && data_space_needed > 0) {
+ /*
+ * We are safe to reserve space here as we can't have delalloc
+ * in the range, see above.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ data_space_needed);
+ if (!ret)
+ data_space_reserved = data_space_needed;
+ }
+
/*
* If ret is still 0, means we're OK to fallocate.
* Or just cleanup the list and exit.
*/
list_for_each_entry_safe(range, tmp, &reserve_list, list) {
- if (!ret)
+ if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
- else
+ /*
+ * btrfs_prealloc_file_range() releases space even
+ * if it returns an error.
+ */
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (data_space_reserved > 0) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, range->start,
- range->len);
+ data_reserved, range->start,
+ range->len);
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (qgroup_reserved > 0) {
+ btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
+ range->start, range->len);
+ qgroup_reserved -= range->len;
+ }
list_del(&range->list);
kfree(range);
}
@@ -3606,10 +3536,6 @@ out_unlock:
&cached_state);
out:
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- /* Let go of our reservation. */
- if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
- cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
}
@@ -3767,8 +3693,7 @@ again:
*/
pagefault_disable();
to->nofault = true;
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, read);
+ ret = btrfs_dio_rw(iocb, to, read);
to->nofault = false;
pagefault_enable();
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 01a408db5683..b1ae3ba2ca2c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -465,7 +465,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
io_ctl->pages[i] = page;
if (uptodate && !PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
@@ -2630,16 +2630,19 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *sinfo = block_group->space_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
- const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ int bg_reclaim_threshold = 0;
bool initial = (size == block_group->length);
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
+ if (!initial)
+ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
+
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
@@ -4069,7 +4072,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "cleaning free space cache v1");
- node = rb_first(&fs_info->block_group_cache_tree);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group, cache_node);
ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 0ae54d8c10d6..1bf89aa67216 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1178,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto abort;
}
- node = rb_first(&fs_info->block_group_cache_tree);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c8a43ecfb9f..81737eff92f3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -64,8 +64,36 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
ssize_t submitted;
struct extent_changeset *data_reserved;
+ bool data_space_reserved;
+ bool nocow_done;
};
+struct btrfs_dio_private {
+ struct inode *inode;
+
+ /*
+	 * Since DIO can use anonymous pages, we cannot use page_offset() to
+	 * grab the file offset, so we need a dedicated member for it.
+ */
+ u64 file_offset;
+ /* Used for bio::bi_size */
+ u32 bytes;
+
+ /*
+ * References to this structure. There is one reference per in-flight
+ * bio plus one while we're still setting up.
+ */
+ refcount_t refs;
+
+ /* Array of checksums */
+ u8 *csums;
+
+ /* This must be last */
+ struct bio bio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
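/*
 * Illustrative sketch (demo_ names are assumptions): a private
 * structure with a trailing struct bio is typically allocated from a
 * bio_set whose front_pad covers the leading members, and recovered
 * from the bio with container_of(). This is presumably how
 * btrfs_dio_bioset pairs with the "This must be last" member above.
 */
#include <linux/bio.h>
#include <linux/refcount.h>

struct demo_dio_private {
	struct inode *inode;
	u64 file_offset;
	refcount_t refs;
	struct bio bio;		/* must be last */
};

static struct bio_set demo_bioset;

static int __init demo_init(void)
{
	/* Reserve room for the members that precede the embedded bio. */
	return bioset_init(&demo_bioset, BIO_POOL_SIZE,
			   offsetof(struct demo_dio_private, bio), 0);
}

static struct demo_dio_private *demo_alloc(struct block_device *bdev,
					   unsigned short nr_vecs,
					   unsigned int opf)
{
	/* Backed by a mempool, so this won't fail for a sleeping GFP. */
	struct bio *bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_NOFS,
					   &demo_bioset);

	return container_of(bio, struct demo_dio_private, bio);
}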
struct btrfs_rename_ctx {
/* Output field. Stores the index number of the old directory entry. */
u64 index;
@@ -222,15 +250,25 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
static int btrfs_dirty_inode(struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+ struct btrfs_new_inode_args *args)
{
int err;
- err = btrfs_init_acl(trans, inode, dir);
- if (!err)
- err = btrfs_xattr_security_init(trans, inode, dir, qstr);
- return err;
+ if (args->default_acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ACL_TYPE_DEFAULT);
+ if (err)
+ return err;
+ }
+ if (args->acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (err)
+ return err;
+ }
+ if (!args->default_acl && !args->acl)
+ cache_no_acl(args->inode);
+ return btrfs_xattr_security_init(trans, args->inode, args->dir,
+ &args->dentry->d_name);
}
/*
@@ -481,17 +519,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
}
/*
- * Check if the inode has flags compatible with compression
- */
-static inline bool inode_can_compress(struct btrfs_inode *inode)
-{
- if (inode->flags & BTRFS_INODE_NODATACOW ||
- inode->flags & BTRFS_INODE_NODATASUM)
- return false;
- return true;
-}
-
-/*
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
@@ -500,7 +527,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (!inode_can_compress(inode)) {
+ if (!btrfs_inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
btrfs_ino(inode));
@@ -1618,6 +1645,141 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
nr_written, 1);
}
+struct can_nocow_file_extent_args {
+ /* Input fields. */
+
+ /* Start file offset of the range we want to NOCOW. */
+ u64 start;
+ /* End file offset (inclusive) of the range we want to NOCOW. */
+ u64 end;
+ bool writeback_path;
+ bool strict;
+ /*
+ * Free the path passed to can_nocow_file_extent() once it's not needed
+ * anymore.
+ */
+ bool free_path;
+
+ /* Output fields. Only set when can_nocow_file_extent() returns 1. */
+
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ /* Number of bytes that can be written to in NOCOW mode. */
+ u64 num_bytes;
+};
+
+/*
+ * Check if we can NOCOW the file extent that the path points to.
+ * This function may return with the path released, so the caller should check
+ * whether path->nodes[0] is NULL before using the path afterwards.
+ *
+ * Returns: < 0 on error
+ * 0 if we can not NOCOW
+ * 1 if we can NOCOW
+ */
+static int can_nocow_file_extent(struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_inode *inode,
+ struct can_nocow_file_extent_args *args)
+{
+ const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 extent_type;
+ int can_nocow = 0;
+ int ret = 0;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ goto out;
+
+ /* Can't access these fields unless we know it's not an inline extent. */
+ args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ extent_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
+ /*
+ * If the extent was created before the generation where the last snapshot
+ * for its subvolume was created, then this implies the extent is shared,
+ * hence we must COW.
+ */
+ if (!args->strict &&
+ btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
+ /* An explicit hole, must COW. */
+ if (args->disk_bytenr == 0)
+ goto out;
+
+ /* Compressed/encrypted/encoded extents must be COWed. */
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The following checks can be expensive, as they need to take other
+ * locks and do btree or rbtree searches, so release the path to avoid
+ * blocking other tasks for too long.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
+ key->offset - args->extent_offset,
+ args->disk_bytenr, false, path);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ if (args->free_path) {
+ /*
+		 * We don't need the path anymore, and the csum_exist_in_range()
+		 * call below will end up allocating another path anyway. So free
+		 * this one to avoid unnecessary extra memory usage.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+ }
+
+ /* If there are pending snapshots for this root, we must COW. */
+ if (args->writeback_path && !is_freespace_inode &&
+ atomic_read(&root->snapshot_force_cow))
+ goto out;
+
+ args->disk_bytenr += args->extent_offset;
+ args->disk_bytenr += args->start - key->offset;
+ args->num_bytes = min(args->end + 1, extent_end) - args->start;
+
+ /*
+ * Force COW if csums exist in the range. This ensures that csums for a
+ * given extent are either valid or do not exist.
+ */
+ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ can_nocow = 1;
+ out:
+ if (args->free_path && path)
+ btrfs_free_path(path);
+
+ return ret < 0 ? ret : can_nocow;
+}
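/*
 * Worked example for the disk_bytenr adjustment above (hypothetical
 * numbers): a file extent item keyed at file offset 16M referencing
 * disk_bytenr = 1024M with extent_offset = 4M, for a NOCOW write
 * starting at args->start = 20M, yields
 * 1024M + 4M + (20M - 16M) = 1032M as the on-disk write position.
 */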
+
/*
 * when the nocow writeback callback runs. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -1638,11 +1800,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
u64 cur_offset = start;
int ret;
bool check_prev = true;
- const bool freespace_inode = btrfs_is_free_space_inode(inode);
u64 ino = btrfs_ino(inode);
+ struct btrfs_block_group *bg;
bool nocow = false;
- u64 disk_bytenr = 0;
- const bool force = inode->flags & BTRFS_INODE_NODATACOW;
+ struct can_nocow_file_extent_args nocow_args = { 0 };
path = btrfs_alloc_path();
if (!path) {
@@ -1655,15 +1816,16 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
return -ENOMEM;
}
+ nocow_args.end = end;
+ nocow_args.writeback_path = true;
+
while (1) {
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
u64 extent_end;
- u64 extent_offset;
- u64 num_bytes = 0;
- u64 disk_num_bytes;
u64 ram_bytes;
+ u64 nocow_end;
int extent_type;
nocow = false;
@@ -1739,116 +1901,38 @@ next_slot:
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
-
+		/* If this is triggered then we have memory corruption. */
+ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
+ if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
+ ret = -EUCLEAN;
+ goto error;
+ }
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- if (extent_type == BTRFS_FILE_EXTENT_REG ||
- extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- extent_offset = btrfs_file_extent_offset(leaf, fi);
- extent_end = found_key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- disk_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf, fi);
- /*
- * If the extent we got ends before our current offset,
- * skip to the next extent.
- */
- if (extent_end <= cur_offset) {
- path->slots[0]++;
- goto next_slot;
- }
- /* Skip holes */
- if (disk_bytenr == 0)
- goto out_check;
- /* Skip compressed/encrypted/encoded extents */
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out_check;
- /*
- * If extent is created before the last volume's snapshot
- * this implies the extent is shared, hence we can't do
- * nocow. This is the same check as in
- * btrfs_cross_ref_exist but without calling
- * btrfs_search_slot.
- */
- if (!freespace_inode &&
- btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
- goto out_check;
- if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
- goto out_check;
+ extent_end = btrfs_file_extent_end(path);
- /*
- * The following checks can be expensive, as they need to
- * take other locks and do btree or rbtree searches, so
- * release the path to avoid blocking other tasks for too
- * long.
- */
- btrfs_release_path(path);
+ /*
+ * If the extent we got ends before our current offset, skip to
+ * the next extent.
+ */
+ if (extent_end <= cur_offset) {
+ path->slots[0]++;
+ goto next_slot;
+ }
- ret = btrfs_cross_ref_exist(root, ino,
- found_key.offset -
- extent_offset, disk_bytenr, false);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
+ nocow_args.start = cur_offset;
+ ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ } else if (ret == 0) {
+ goto out_check;
+ }
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- disk_bytenr += extent_offset;
- disk_bytenr += cur_offset - found_key.offset;
- num_bytes = min(end + 1, extent_end) - cur_offset;
- /*
- * If there are pending snapshots for this root, we
- * fall into common COW way
- */
- if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
- goto out_check;
- /*
- * force cow if csum exists in the range.
- * this ensure that csum for a given extent are
- * either valid or do not exist.
- */
- ret = csum_exist_in_range(fs_info, disk_bytenr,
- num_bytes);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- /* If the extent's block group is RO, we must COW */
- if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
- goto out_check;
+ ret = 0;
+ bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ if (bg)
nocow = true;
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- extent_end = found_key.offset + ram_bytes;
- extent_end = ALIGN(extent_end, fs_info->sectorsize);
- /* Skip extents outside of our requested range */
- if (extent_end <= start) {
- path->slots[0]++;
- goto next_slot;
- }
- } else {
- /* If this triggers then we have a memory corruption */
- BUG();
- }
out_check:
/*
* If nocow is false then record the beginning of the range
@@ -1880,15 +1964,17 @@ out_check:
cow_start = (u64)-1;
}
+ nocow_end = cur_offset + nocow_args.num_bytes - 1;
+
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 orig_start = found_key.offset - extent_offset;
+ u64 orig_start = found_key.offset - nocow_args.extent_offset;
struct extent_map *em;
- em = create_io_em(inode, cur_offset, num_bytes,
+ em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
orig_start,
- disk_bytenr, /* block_start */
- num_bytes, /* block_len */
- disk_num_bytes, /* orig_block_len */
+ nocow_args.disk_bytenr, /* block_start */
+ nocow_args.num_bytes, /* block_len */
+ nocow_args.disk_num_bytes, /* orig_block_len */
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
@@ -1897,20 +1983,23 @@ out_check:
}
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode,
- cur_offset, num_bytes, num_bytes,
- disk_bytenr, num_bytes, 0,
+ cur_offset, nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes, 0,
1 << BTRFS_ORDERED_PREALLOC,
BTRFS_COMPRESS_NONE);
if (ret) {
btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset + num_bytes - 1,
- 0);
+ nocow_end, 0);
goto error;
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
- num_bytes, num_bytes,
- disk_bytenr, num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes,
0,
1 << BTRFS_ORDERED_NOCOW,
BTRFS_COMPRESS_NONE);
@@ -1918,9 +2007,10 @@ out_check:
goto error;
}
- if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
- nocow = false;
+ if (nocow) {
+ btrfs_dec_nocow_writers(bg);
+ nocow = false;
+ }
if (btrfs_is_data_reloc_root(root))
/*
@@ -1929,10 +2019,9 @@ out_check:
* from freeing metadata of created ordered extent.
*/
ret = btrfs_reloc_clone_csums(inode, cur_offset,
- num_bytes);
+ nocow_args.num_bytes);
- extent_clear_unlock_delalloc(inode, cur_offset,
- cur_offset + num_bytes - 1,
+ extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
@@ -1965,7 +2054,7 @@ out_check:
error:
if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
+ btrfs_dec_nocow_writers(bg);
if (ret && cur_offset < end)
extent_clear_unlock_delalloc(inode, cur_offset, end,
@@ -2019,7 +2108,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
- } else if (!inode_can_compress(inode) ||
+ } else if (!btrfs_inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned)
ret = run_delalloc_zoned(inode, locked_page, start, end,
@@ -2509,9 +2598,8 @@ out:
*
* c-3) otherwise: async submit
*/
-blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
-
+void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2540,16 +2628,14 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
if (ret)
goto out;
- if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
/*
* btrfs_submit_compressed_read will handle completing
* the bio if there were any errors, so just return
* here.
*/
- ret = btrfs_submit_compressed_read(inode, bio,
- mirror_num,
- bio_flags);
- goto out_no_endio;
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
} else {
/*
* Lookup bio sums does extra checks around whether we
@@ -2566,7 +2652,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
+ ret = btrfs_wq_submit_bio(inode, bio, mirror_num,
0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
@@ -2583,8 +2669,6 @@ out:
bio->bi_status = ret;
bio_endio(bio);
}
-out_no_endio:
- return ret;
}
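/*
 * Editor's sketch (not part of the patch): btrfs_submit_data_bio() now
 * returns void -- on failure it stamps bi_status on the bio and completes
 * it itself, so callers no longer carry an error path. A minimal userspace
 * analogue of that ownership model, with hypothetical toy types:
 */
#include <stdio.h>

struct toy_bio {
	int status;			/* 0 = OK, negative = errno-style */
	void (*end_io)(struct toy_bio *);
};

static void toy_end_io(struct toy_bio *bio)
{
	printf("bio completed, status=%d\n", bio->status);
}

/* Returns nothing: on error, complete the bio instead of reporting up. */
static void toy_submit(struct toy_bio *bio, int simulate_error)
{
	if (simulate_error) {
		bio->status = -5;	/* -EIO */
		bio->end_io(bio);	/* the error path owns completion */
		return;
	}
	/* ... hand the bio to the lower layer here ... */
	bio->end_io(bio);
}

int main(void)
{
	struct toy_bio ok = { 0, toy_end_io }, bad = { 0, toy_end_io };

	toy_submit(&ok, 0);
	toy_submit(&bad, 1);
	return 0;
}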
/*
@@ -3275,11 +3359,11 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
shash->tfm = fs_info->csum_shash;
crypto_shash_digest(shash, kaddr + pgoff, len, csum);
+ kunmap_atomic(kaddr);
if (memcmp(csum, csum_expected, csum_size))
goto zeroit;
- kunmap_atomic(kaddr);
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
@@ -3287,9 +3371,7 @@ zeroit:
if (bbio->device)
btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memset(kaddr + pgoff, 1, len);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
+ memzero_page(page, pgoff, len);
return -EIO;
}
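/*
 * Editor's sketch (not part of the patch): memzero_page() bundles the
 * map/memset/flush/unmap sequence the old error path open-coded, which is
 * why the hunk above can also drop the explicit flush_dcache_page() and
 * kunmap_atomic() calls. From memory, the generic helper in
 * include/linux/highmem.h of this era looks roughly like:
 *
 *	static inline void memzero_page(struct page *page, size_t offset,
 *					size_t len)
 *	{
 *		char *addr = kmap_local_page(page);
 *		memset(addr + offset, 0, len);
 *		flush_dcache_page(page);
 *		kunmap_local(addr);
 *	}
 *
 * The reordered kunmap_atomic() above is safe because the checksum was
 * already digested into a local buffer before the mapping is dropped.
 */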
@@ -3494,6 +3576,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
+ /* Bail out if the cleanup is already running. */
if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
@@ -3576,17 +3659,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*
* btrfs_find_orphan_roots() ran before us, which has
* found all deleted roots and loaded them into
- * fs_info->fs_roots_radix. So here we can find if an
+ * fs_info->fs_roots. So here we can find if an
* orphan item corresponds to a deleted root by looking
- * up the root from that radix tree.
+ * up the root from that xarray.
*/
- spin_lock(&fs_info->fs_roots_radix_lock);
- dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)found_key.objectid);
+ spin_lock(&fs_info->fs_roots_lock);
+ dead_root = xa_load(&fs_info->fs_roots,
+ (unsigned long)found_key.objectid);
if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
is_dead_root = 1;
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
if (is_dead_root) {
/* prevent this orphan from being found again */
@@ -3826,7 +3909,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4210,8 +4293,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the dir index
* 1 for the inode ref
* 1 for the inode
+ * 1 for the parent inode
*/
- return btrfs_start_transaction_fallback_global_rsv(root, 5);
+ return btrfs_start_transaction_fallback_global_rsv(root, 6);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4704,7 +4788,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
@@ -4725,7 +4809,7 @@ again:
goto out_unlock;
if (!PageUptodate(page)) {
- ret = btrfs_readpage(NULL, page);
+ ret = btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != mapping) {
unlock_page(page);
@@ -5791,8 +5875,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct list_head ins_list;
struct list_head del_list;
int ret;
- struct extent_buffer *leaf;
- int slot;
char *name_ptr;
int name_len;
int entries = 0;
@@ -5819,35 +5901,19 @@ again:
key.offset = ctx->pos;
key.objectid = btrfs_ino(BTRFS_I(inode));
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, ret) {
struct dir_entry *entry;
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ struct extent_buffer *leaf = path->nodes[0];
if (found_key.objectid != key.objectid)
break;
if (found_key.type != BTRFS_DIR_INDEX_KEY)
break;
if (found_key.offset < ctx->pos)
- goto next;
+ continue;
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
- goto next;
- di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ continue;
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
name_len = btrfs_dir_name_len(leaf, di);
if ((total_len + sizeof(struct dir_entry) + name_len) >=
PAGE_SIZE) {
@@ -5874,9 +5940,11 @@ again:
entries++;
addr += sizeof(struct dir_entry) + name_len;
total_len += sizeof(struct dir_entry) + name_len;
-next:
- path->slots[0]++;
}
+ /* Catch error encountered during iteration */
+ if (ret < 0)
+ goto err;
+
btrfs_release_path(path);
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
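/*
 * Editor's sketch (not part of the patch): btrfs_for_each_slot() is a
 * btrfs iterator macro that hides the search_slot/next_leaf boilerplate
 * deleted above and surfaces the terminating error in `ret`. A toy
 * userspace analogue of the same shape, with hypothetical names:
 */
#include <stdio.h>

struct toy_iter { const int *items; int count; int pos; };

static int toy_first(struct toy_iter *it) { it->pos = 0; return it->pos < it->count ? 0 : 1; }
static int toy_next(struct toy_iter *it)  { it->pos++;  return it->pos < it->count ? 0 : 1; }

/* ret: 0 while iterating, >0 on clean end, <0 would signal an error. */
#define toy_for_each(it, val, ret)					\
	for ((ret) = toy_first(it);					\
	     (ret) == 0 && ((val) = (it)->items[(it)->pos], 1);	\
	     (ret) = toy_next(it))

int main(void)
{
	const int keys[] = { 3, 5, 8 };
	struct toy_iter it = { keys, 3, 0 };
	int ret, v;

	toy_for_each(&it, v, ret) {
		if (v < 5)
			continue;	/* like the `goto next` -> continue change */
		printf("key %d\n", v);
	}
	if (ret < 0)
		printf("iteration error\n");
	return 0;
}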
@@ -6064,6 +6132,57 @@ static int btrfs_insert_inode_locked(struct inode *inode)
btrfs_find_actor, &args);
}
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items)
+{
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ int ret;
+
+ ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
+ if (ret)
+ return ret;
+
+ /* 1 to add inode item */
+ *trans_num_items = 1;
+ /* 1 to add compression property */
+ if (BTRFS_I(dir)->prop_compress)
+ (*trans_num_items)++;
+ /* 1 to add default ACL xattr */
+ if (args->default_acl)
+ (*trans_num_items)++;
+ /* 1 to add access ACL xattr */
+ if (args->acl)
+ (*trans_num_items)++;
+#ifdef CONFIG_SECURITY
+ /* 1 to add LSM xattr */
+ if (dir->i_security)
+ (*trans_num_items)++;
+#endif
+ if (args->orphan) {
+ /* 1 to add orphan item */
+ (*trans_num_items)++;
+ } else {
+ /*
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
+ *
+ * No need for 1 unit for the inode ref item because it is
+ * inserted in a batch together with the inode item at
+ * btrfs_create_new_inode().
+ */
+ *trans_num_items += 3;
+ }
+ return 0;
+}
+
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
+{
+ posix_acl_release(args->acl);
+ posix_acl_release(args->default_acl);
+}
+
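/*
 * Editor's sketch (not part of the patch): the prepare/destroy pair above
 * centralizes the "how many tree items will this create?" arithmetic that
 * each caller previously hard-coded. A hypothetical userspace rendering of
 * the same accounting, handy for sanity-checking the comments above:
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int new_inode_trans_items(bool compress_prop, bool default_acl,
					  bool acl, bool lsm_xattr, bool orphan)
{
	unsigned int items = 1;		/* inode item */

	items += compress_prop;		/* compression property xattr */
	items += default_acl;		/* default ACL xattr */
	items += acl;			/* access ACL xattr */
	items += lsm_xattr;		/* LSM security xattr */
	if (orphan)
		items += 1;		/* orphan item */
	else
		items += 3;		/* dir item + dir index + parent inode */
	return items;
}

int main(void)
{
	/* Plain create, SELinux on, no ACLs: 1 + 1 + 3 = 5 items. */
	printf("%u\n", new_inode_trans_items(false, false, false, true, false));
	/* O_TMPFILE with both ACLs: 1 + 2 + 1 = 4 items. */
	printf("%u\n", new_inode_trans_items(false, true, true, false, true));
	return 0;
}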
/*
* Inherit flags from the parent inode.
*
@@ -6073,9 +6192,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
unsigned int flags;
- if (!dir)
- return;
-
flags = BTRFS_I(dir)->flags;
if (flags & BTRFS_INODE_NOCOMPRESS) {
@@ -6095,76 +6211,86 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
btrfs_sync_inode_flags_to_i_flags(inode);
}
-static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct user_namespace *mnt_userns,
- struct inode *dir,
- const char *name, int name_len,
- u64 ref_objectid, u64 objectid,
- umode_t mode, u64 *index)
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ const char *name = args->orphan ? NULL : args->dentry->d_name.name;
+ int name_len = args->orphan ? 0 : args->dentry->d_name.len;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
struct btrfs_path *path;
+ u64 objectid;
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
struct btrfs_item_batch batch;
unsigned long ptr;
- unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
-
- nofs_flag = memalloc_nofs_save();
- inode = new_inode(fs_info->sb);
- memalloc_nofs_restore(nofs_flag);
- if (!inode) {
- btrfs_free_path(path);
- return ERR_PTR(-ENOMEM);
- }
+ return -ENOMEM;
- /*
- * O_TMPFILE, set link count to 0, so that after this point,
- * we fill in an inode item with the correct link count.
- */
- if (!name)
- set_nlink(inode, 0);
+ if (!args->subvol)
+ BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
+ root = BTRFS_I(inode)->root;
- /*
- * we have to initialize this early, so we can reclaim the inode
- * number if we fail afterwards in this function.
- */
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
+ goto out;
inode->i_ino = objectid;
- if (dir && name) {
+ if (args->orphan) {
+ /*
+ * O_TMPFILE, set link count to 0, so that after this point, we
+ * fill in an inode item with the correct link count.
+ */
+ set_nlink(inode, 0);
+ } else {
trace_btrfs_inode_request(dir);
- ret = btrfs_set_inode_index(BTRFS_I(dir), index);
- if (ret) {
- btrfs_free_path(path);
- iput(inode);
- return ERR_PTR(ret);
- }
- } else if (dir) {
- *index = 0;
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
+ if (ret)
+ goto out;
}
- /*
- * index_cnt is ignored for everything but a dir,
- * btrfs_set_inode_index_count has an explanation for the magic
- * number
- */
- BTRFS_I(inode)->index_cnt = 2;
- BTRFS_I(inode)->dir_index = *index;
- BTRFS_I(inode)->root = btrfs_grab_root(root);
+ /* index_cnt is ignored for everything but a dir. */
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * Subvolumes don't inherit flags from their parent directory.
+ * Originally this was probably by accident, but we probably can't
+ * change it now without compatibility issues.
+ */
+ if (!args->subvol)
+ btrfs_inherit_iflags(inode, dir);
+
+ if (S_ISREG(inode->i_mode)) {
+ if (btrfs_test_opt(fs_info, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(fs_info, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
+ }
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ location->type = BTRFS_INODE_ITEM_KEY;
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0) {
+ if (!args->orphan)
+ BTRFS_I(dir)->index_cnt--;
+ goto out;
+ }
+
+ /*
* We could have gotten an inode number from somebody who was fsynced
* and then removed in this same transaction, so let's just set full
* sync since it will be a full sync anyway and this will blow away the
@@ -6178,7 +6304,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[0] = sizeof(struct btrfs_inode_item);
- if (name) {
+ if (!args->orphan) {
/*
* Start new inodes with an inode_ref. This is slightly more
* efficient for small numbers of hard links since they will
@@ -6187,64 +6313,95 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
key[1].objectid = objectid;
key[1].type = BTRFS_INODE_REF_KEY;
- key[1].offset = ref_objectid;
-
- sizes[1] = name_len + sizeof(*ref);
- }
-
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
- ret = btrfs_insert_inode_locked(inode);
- if (ret < 0) {
- iput(inode);
- goto fail;
+ if (args->subvol) {
+ key[1].offset = objectid;
+ sizes[1] = 2 + sizeof(*ref);
+ } else {
+ key[1].offset = btrfs_ino(BTRFS_I(dir));
+ sizes[1] = name_len + sizeof(*ref);
+ }
}
batch.keys = &key[0];
batch.data_sizes = &sizes[0];
- batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
- batch.nr = name ? 2 : 1;
+ batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
+ batch.nr = args->orphan ? 1 : 2;
ret = btrfs_insert_empty_items(trans, root, path, &batch);
- if (ret != 0)
- goto fail_unlock;
-
- inode_init_owner(mnt_userns, inode, dir, mode);
- inode_set_bytes(inode, 0);
+ if (ret != 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
+ /*
+ * We're going to fill the inode item now, so at this point the inode
+ * must be fully initialized.
+ */
+
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- if (name) {
+ if (!args->orphan) {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (args->subvol) {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
+ write_extent_buffer(path->nodes[0], "..", ptr, 2);
+ } else {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref,
+ BTRFS_I(inode)->dir_index);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
}
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_free_path(path);
+ btrfs_release_path(path);
- btrfs_inherit_iflags(inode, dir);
+ if (args->subvol) {
+ struct inode *parent;
- if (S_ISREG(mode)) {
- if (btrfs_test_opt(fs_info, NODATASUM))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(fs_info, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ /*
+ * Subvolumes inherit properties from their parent subvolume,
+ * not the directory they were created in.
+ */
+ parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_I(dir)->root);
+ if (IS_ERR(parent)) {
+ ret = PTR_ERR(parent);
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, parent);
+ iput(parent);
+ }
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ }
+ if (ret) {
+ btrfs_err(fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
+ ret);
+ }
+
+ /*
+ * Subvolumes don't inherit ACLs or get passed to the LSM. This is
+ * probably a bug.
+ */
+ if (!args->subvol) {
+ ret = btrfs_init_inode_security(trans, args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
inode_tree_add(inode);
@@ -6254,21 +6411,30 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_update_root_times(trans, root);
- ret = btrfs_inode_inherit_props(trans, inode, dir);
- if (ret)
- btrfs_err(fs_info,
- "error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
+ if (args->orphan) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ } else {
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+ name_len, 0, BTRFS_I(inode)->dir_index);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
- return inode;
+ ret = 0;
+ goto out;
-fail_unlock:
+discard:
+ /*
+ * discard_new_inode() calls iput(), but the caller owns the reference
+ * to the inode.
+ */
+ ihold(inode);
discard_new_inode(inode);
-fail:
- if (dir && name)
- BTRFS_I(dir)->index_cnt--;
+out:
btrfs_free_path(path);
- return ERR_PTR(ret);
+ return ret;
}
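/*
 * Editor's sketch (not part of the patch): the discard path above takes an
 * extra reference (ihold) because discard_new_inode() drops one via iput()
 * while the caller still owns its own reference. A toy refcount analogue:
 */
#include <stdio.h>

struct toy_obj { int refs; };

static void toy_get(struct toy_obj *o) { o->refs++; }
static void toy_put(struct toy_obj *o)
{
	if (--o->refs == 0)
		printf("freed\n");
}

/* Like discard_new_inode(): tears down state and consumes one reference. */
static void toy_discard(struct toy_obj *o)
{
	/* ... unwind partially built state ... */
	toy_put(o);
}

int main(void)
{
	struct toy_obj obj = { .refs = 1 };	/* caller's reference */

	toy_get(&obj);		/* extra ref, like ihold() before discard */
	toy_discard(&obj);	/* consumes only the extra ref */
	printf("caller still holds %d ref(s)\n", obj.refs);
	toy_put(&obj);		/* the caller's own put, like its final iput() */
	return 0;
}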
/*
@@ -6360,147 +6526,71 @@ fail_dir_item:
return ret;
}
-static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
- struct btrfs_inode *dir, struct dentry *dentry,
- struct btrfs_inode *inode, int backref, u64 index)
-{
- int err = btrfs_add_link(trans, dir, inode,
- dentry->d_name.name, dentry->d_name.len,
- backref, index);
- if (err > 0)
- err = -EEXIST;
- return err;
-}
-
-static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t rdev)
+static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .inode = inode,
+ };
+ unsigned int trans_num_items;
+ struct btrfs_trans_handle *trans;
int err;
- u64 objectid;
- u64 index = 0;
-
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- err = btrfs_get_free_objectid(root, &objectid);
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- btrfs_update_inode(trans, root, BTRFS_I(inode));
- d_instantiate_new(dentry, inode);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!err)
+ d_instantiate_new(dentry, inode);
-out_unlock:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
-static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
+static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- int err;
- u64 objectid;
- u64 index = 0;
+ struct inode *inode;
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ return btrfs_create_common(dir, dentry, inode);
+}
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
- goto out_unlock;
+static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct inode *inode;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
- }
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- d_instantiate_new(dentry, inode);
-
-out_unlock:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6546,8 +6636,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 1, index);
+ err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ dentry->d_name.name, dentry->d_name.len, 1, index);
if (err) {
drop_inode = 1;
@@ -6584,66 +6674,15 @@ fail:
static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct inode *inode = NULL;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- int err = 0;
- u64 objectid = 0;
- u64 index = 0;
-
- /*
- * 2 items for inode and ref
- * 2 items for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
- goto out_fail;
-
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid,
- S_IFDIR | mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_fail;
- }
+ struct inode *inode;
- /* these must be set before we unlock the inode */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_fail;
-
- btrfs_i_size_write(BTRFS_I(inode), 0);
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (err)
- goto out_fail;
-
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- dentry->d_name.name,
- dentry->d_name.len, 0, index);
- if (err)
- goto out_fail;
-
- d_instantiate_new(dentry, inode);
-
-out_fail:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -7152,6 +7191,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
@@ -7159,13 +7199,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 disk_bytenr;
- u64 backref_offset;
- u64 extent_end;
- u64 num_bytes;
- int slot;
int found_type;
- bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
@@ -7176,18 +7210,17 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
if (ret < 0)
goto out;
- slot = path->slots[0];
if (ret == 1) {
- if (slot == 0) {
+ if (path->slots[0] == 0) {
/* can't find the item, must cow */
ret = 0;
goto out;
}
- slot--;
+ path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
@@ -7199,55 +7232,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
goto out;
}
- fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(leaf, fi);
- if (found_type != BTRFS_FILE_EXTENT_REG &&
- found_type != BTRFS_FILE_EXTENT_PREALLOC) {
- /* not a regular extent, must cow */
- goto out;
- }
-
- if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ if (btrfs_file_extent_end(path) <= offset)
goto out;
- extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end <= offset)
- goto out;
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (ram_bytes)
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (disk_bytenr == 0)
- goto out;
+ nocow_args.start = offset;
+ nocow_args.end = offset + *len - 1;
+ nocow_args.strict = strict;
+ nocow_args.free_path = true;
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out;
+ ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ /* can_nocow_file_extent() has freed the path. */
+ path = NULL;
- /*
- * Do the same check as in btrfs_cross_ref_exist but without the
- * unnecessary search.
- */
- if (!strict &&
- (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item)))
+ if (ret != 1) {
+ /* Treat errors as not being able to NOCOW. */
+ ret = 0;
goto out;
-
- backref_offset = btrfs_file_extent_offset(leaf, fi);
-
- if (orig_start) {
- *orig_start = key.offset - backref_offset;
- *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
- if (btrfs_extent_readonly(fs_info, disk_bytenr))
+ ret = 0;
+ if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
goto out;
- num_bytes = min(offset + *len, extent_end) - offset;
- if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + num_bytes,
+ range_end = round_up(offset + nocow_args.num_bytes,
root->fs_info->sectorsize) - 1;
ret = test_range_bit(io_tree, offset, range_end,
EXTENT_DELALLOC, 0, NULL);
@@ -7257,36 +7273,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
}
- btrfs_release_path(path);
-
- /*
- * look for other files referencing this extent, if we
- * find any we must cow
- */
-
- ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr,
- strict);
- if (ret) {
- ret = 0;
- goto out;
- }
+ if (orig_start)
+ *orig_start = key.offset - nocow_args.extent_offset;
+ if (orig_block_len)
+ *orig_block_len = nocow_args.disk_num_bytes;
- /*
- * adjust disk_bytenr and num_bytes to cover just the bytes
- * in this extent we are about to write. If there
- * are any csums in that range we have to cow in order
- * to keep the csums correct
- */
- disk_bytenr += backref_offset;
- disk_bytenr += offset - key.offset;
- if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
- goto out;
- /*
- * all of the above have passed, it is safe to overwrite this extent
- * without cow
- */
- *len = num_bytes;
+ *len = nocow_args.num_bytes;
ret = 1;
out:
btrfs_free_path(path);
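/*
 * Editor's sketch (not part of the patch): both NOCOW paths now feed a
 * single can_nocow_file_extent_args struct instead of a pile of loose
 * locals and out-parameters. The general refactor shape, with toy names:
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_nocow_args {
	/* in */
	unsigned long start, end;
	bool strict;
	/* out, valid only when the check returns 1 */
	unsigned long disk_bytenr, num_bytes, extent_offset;
};

/* Returns 1 if the range can be written NOCOW, 0 if not, <0 on error. */
static int toy_can_nocow(struct toy_nocow_args *args)
{
	if (args->end <= args->start)
		return -1;
	args->disk_bytenr = 8192;
	args->num_bytes = args->end - args->start + 1;
	args->extent_offset = 0;
	return 1;
}

int main(void)
{
	struct toy_nocow_args args = { .start = 0, .end = 4095, .strict = false };
	int ret = toy_can_nocow(&args);

	if (ret == 1)
		printf("nocow %lu bytes at %lu\n", args.num_bytes, args.disk_bytenr);
	else
		printf("must cow (ret=%d)\n", ret);	/* errors treated as "must cow" */
	return 0;
}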
@@ -7294,14 +7286,22 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, bool writing)
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
int ret = 0;
while (1) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ lock_extent_bits(io_tree, lockstart, lockend, cached_state);
+ }
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure there's no ordered
@@ -7322,10 +7322,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ unlock_extent_cached(io_tree, lockstart, lockend, cached_state);
if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
/*
* If we are doing a DIO read and the ordered extent we
* found is for a buffered write, we can not wait for it
@@ -7345,7 +7349,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
btrfs_start_ordered_extent(ordered, 1);
else
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
btrfs_put_ordered_extent(ordered);
} else {
/*
@@ -7361,7 +7365,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* ordered extent to complete while holding a lock on
* that page.
*/
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
}
if (ret)
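/*
 * Editor's sketch (not part of the patch): the NOWAIT handling above maps
 * "would block" onto -EAGAIN instead of sleeping on the extent lock. The
 * same pattern with a plain pthread mutex (hypothetical helper, build with
 * -lpthread):
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static int lock_range(pthread_mutex_t *lock, bool nowait)
{
	if (nowait) {
		if (pthread_mutex_trylock(lock) != 0)
			return -EAGAIN;	/* retried later from a context that may sleep */
	} else {
		pthread_mutex_lock(lock);
	}
	return 0;
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&lock);				/* simulate contention */
	printf("nowait: %d\n", lock_range(&lock, true));	/* -EAGAIN */
	pthread_mutex_unlock(&lock);
	printf("blocking: %d\n", lock_range(&lock, false));	/* 0 */
	pthread_mutex_unlock(&lock);
	return 0;
}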
@@ -7435,12 +7439,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct inode *inode,
struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
+ u64 start, u64 len,
+ unsigned int iomap_flags)
{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = *map;
int type;
u64 block_start, orig_start, orig_block_len, ram_bytes;
+ struct btrfs_block_group *bg;
bool can_nocow = false;
bool space_reserved = false;
u64 prev_len;
@@ -7466,9 +7473,11 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start))
- can_nocow = true;
+ &orig_block_len, &ram_bytes, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
+ }
}
prev_len = len;
@@ -7476,12 +7485,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct extent_map *em2;
/* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
- btrfs_dec_nocow_writers(fs_info, block_start);
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
goto out;
}
space_reserved = true;
@@ -7490,7 +7502,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
- btrfs_dec_nocow_writers(fs_info, block_start);
+ btrfs_dec_nocow_writers(bg);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
*map = em = em2;
@@ -7500,15 +7512,29 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
ret = PTR_ERR(em2);
goto out;
}
+
+ dio_data->nocow_done = true;
} else {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
- /* We have to COW, so need to reserve metadata and data space. */
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, len);
+ if (nowait)
+ return -EAGAIN;
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved)
+ return -ENOSPC;
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
if (ret < 0)
goto out;
space_reserved = true;
@@ -7521,10 +7547,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
*map = em;
len = min(len, em->len - (start - em->start));
if (len < prev_len)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start + len, prev_len - len,
- true);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
}
/*
@@ -7542,15 +7566,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
out:
if (ret && space_reserved) {
btrfs_delalloc_release_extents(BTRFS_I(inode), len);
- if (can_nocow) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
- } else {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start, len, true);
- extent_changeset_free(dio_data->data_reserved);
- dio_data->data_reserved = NULL;
- }
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
}
return ret;
}
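/*
 * Editor's sketch (not part of the patch): under IOMAP_NOWAIT the hunk
 * above also rewrites "no space right now" style reservation failures into
 * -EAGAIN, since flushing from a blocking context might satisfy them.
 * The minimal shape of that translation:
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int map_nowait_err(int ret, bool nowait)
{
	/* ENOSPC/EDQUOT may be transient; a blocking retry can flush space. */
	if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
		return -EAGAIN;
	return ret;
}

int main(void)
{
	printf("%d\n", map_nowait_err(-ENOSPC, true));	/* -EAGAIN */
	printf("%d\n", map_nowait_err(-ENOSPC, false));	/* -ENOSPC */
	return 0;
}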
@@ -7559,14 +7575,16 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
loff_t length, unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = NULL;
+ struct btrfs_dio_data *dio_data = iter->private;
u64 lockstart, lockend;
const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
u64 len = length;
+ const u64 data_alloc_len = length;
bool unlock_extents = false;
if (!write)
@@ -7576,34 +7594,67 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
lockend = start + len - 1;
/*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so we
- * need to flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk.
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages are no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
*/
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags)) {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
- if (ret)
- return ret;
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
}
- dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
- if (!dio_data)
- return -ENOMEM;
-
- iomap->private = dio_data;
+ memset(dio_data, 0, sizeof(*dio_data));
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+ * file range, we fail with ENOSPC only if we figure out we can not do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
+ }
/*
* If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered.
+ * this range and we need to fall back to buffered IO, or we are doing a
+ * NOWAIT read/write and the operation would block.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
- ret = -ENOTBLK;
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
goto err;
- }
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
@@ -7663,12 +7714,30 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, len);
+ start, len, flags);
if (ret < 0)
goto unlock_err;
unlock_extents = true;
/* Recalc len in case the new em is smaller than requested */
len = min(len, em->len - (start - em->start));
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
+
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
+ }
} else {
/*
* We need to unlock only the end area that we aren't using.
@@ -7713,7 +7782,12 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- kfree(dio_data);
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
return ret;
}
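/*
 * Editor's sketch (not part of the patch): the iomap_begin change reserves
 * data space for the whole requested length *before* locking the file
 * range, then gives back whatever turns out to be unneeded -- all of it
 * for a NOCOW write, or the tail when the mapped extent is shorter.
 * A hypothetical toy of that reserve-early/trim-later pattern:
 */
#include <stdio.h>

struct toy_space { unsigned long reserved; };

static void toy_reserve(struct toy_space *s, unsigned long n) { s->reserved += n; }
static void toy_release(struct toy_space *s, unsigned long n) { s->reserved -= n; }

int main(void)
{
	struct toy_space space = { 0 };
	unsigned long data_alloc_len = 1 << 20;	/* requested: 1 MiB */
	unsigned long len = 256 << 10;		/* mapped extent: 256 KiB */
	int nocow_done = 0;			/* 1 if the write goes NOCOW */

	toy_reserve(&space, data_alloc_len);	/* before taking the range lock */

	if (nocow_done)
		toy_release(&space, data_alloc_len);		/* NOCOW needs no data space */
	else if (len < data_alloc_len)
		toy_release(&space, data_alloc_len - len);	/* trim the unused tail */

	printf("still reserved: %lu bytes\n", space.reserved);
	return 0;
}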
@@ -7721,15 +7795,16 @@ err:
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned int flags, struct iomap *iomap)
{
- int ret = 0;
- struct btrfs_dio_data *dio_data = iomap->private;
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
size_t submitted = dio_data->submitted;
const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
- goto out;
+ return 0;
}
if (submitted < length) {
@@ -7746,10 +7821,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (write)
extent_changeset_free(dio_data->data_reserved);
-out:
- kfree(dio_data);
- iomap->private = NULL;
-
return ret;
}
@@ -7762,40 +7833,36 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (!refcount_dec_and_test(&dip->refs))
return;
- if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
+ if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
dip->file_offset,
dip->bytes,
- !dip->dio_bio->bi_status);
+ !dip->bio.bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
dip->file_offset,
dip->file_offset + dip->bytes - 1);
}
- bio_endio(dip->dio_bio);
- kfree(dip);
+ kfree(dip->csums);
+ bio_endio(&dip->bio);
}
-static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ enum btrfs_compression_type compress_type)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- blk_status_t ret;
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
+ if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
+ return;
refcount_inc(&dip->refs);
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- if (ret)
+ if (btrfs_map_bio(fs_info, bio, mirror_num))
refcount_dec(&dip->refs);
- return ret;
}
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
@@ -7880,7 +7947,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
- dip->dio_bio->bi_status = err;
+ dip->bio.bi_status = err;
btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
@@ -7910,7 +7977,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto map;
if (write && async_submit) {
- ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
+ ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset,
btrfs_submit_bio_start_direct_io);
goto err;
} else if (write) {
@@ -7935,50 +8002,16 @@ err:
return ret;
}
-/*
- * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
- * or ordered extents whether or not we submit any bios.
- */
-static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
- struct inode *inode,
- loff_t file_offset)
-{
- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
- size_t dip_size;
- struct btrfs_dio_private *dip;
-
- dip_size = sizeof(*dip);
- if (!write && csum) {
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- size_t nblocks;
-
- nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
- dip_size += fs_info->csum_size * nblocks;
- }
-
- dip = kzalloc(dip_size, GFP_NOFS);
- if (!dip)
- return NULL;
-
- dip->inode = inode;
- dip->file_offset = file_offset;
- dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
- dip->dio_bio = dio_bio;
- refcount_set(&dip->refs, 1);
- return dip;
-}
-
static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
+ struct btrfs_dio_private *dip =
+ container_of(dio_bio, struct btrfs_dio_private, bio);
struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
- struct btrfs_dio_private *dip;
struct bio *bio;
u64 start_sector;
int async_submit = 0;
@@ -7989,27 +8022,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iter->iomap.private;
+ struct btrfs_dio_data *dio_data = iter->private;
struct extent_map *em = NULL;
- dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
- if (!dip) {
- if (!write) {
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
- file_offset + dio_bio->bi_iter.bi_size - 1);
- }
- dio_bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(dio_bio);
- return;
- }
+ dip->inode = inode;
+ dip->file_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ refcount_set(&dip->refs, 1);
+ dip->csums = NULL;
+
+ if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ unsigned int nr_sectors =
+ (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
- if (!write) {
/*
* Load the csums up front to reduce csum tree searches and
* contention when submitting bios.
- *
- * If we have csums disabled this will do nothing.
*/
+ status = BLK_STS_RESOURCE;
+ dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
+ if (!dip->csums)
+ goto out_err;
+
status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
if (status != BLK_STS_OK)
goto out_err;
@@ -8099,19 +8133,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
out_err_em:
free_extent_map(em);
out_err:
- dip->dio_bio->bi_status = status;
+ dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
}
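/*
 * Editor's sketch (not part of the patch): btrfs_dio_private is now
 * allocated together with its bio -- the bioset front pad (see the
 * bioset_init() hunk further down) places the private data just before
 * the embedded bio, and container_of() walks back from the bio pointer.
 * A runnable userspace model of that layout trick, with toy types:
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_bio { unsigned int size; };

struct toy_dio_private {
	unsigned long file_offset;	/* "front pad": private state ... */
	struct toy_bio bio;		/* ... placed before the embedded bio */
};

int main(void)
{
	struct toy_dio_private *dip = malloc(sizeof(*dip));
	struct toy_bio *bio;

	dip->file_offset = 4096;
	bio = &dip->bio;		/* lower layers only ever see the bio */

	/* Completion recovers the containing private data from the bio. */
	struct toy_dio_private *back = container_of(bio, struct toy_dio_private, bio);
	assert(back == dip);
	printf("file_offset=%lu\n", back->file_offset);
	free(dip);
	return 0;
}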
-const struct iomap_ops btrfs_dio_iomap_ops = {
+static const struct iomap_ops btrfs_dio_iomap_ops = {
.iomap_begin = btrfs_dio_iomap_begin,
.iomap_end = btrfs_dio_iomap_end,
};
-const struct iomap_dio_ops btrfs_dio_ops = {
+static const struct iomap_dio_ops btrfs_dio_ops = {
.submit_io = btrfs_submit_direct,
+ .bio_set = &btrfs_dio_bioset,
};
+ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
+{
+ struct btrfs_dio_data data;
+
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -8124,27 +8167,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
-int btrfs_readpage(struct file *file, struct page *page)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
- if (bio_ctrl.bio) {
- int ret2;
-
- ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
-}
-
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
@@ -8182,7 +8204,7 @@ static void btrfs_readahead(struct readahead_control *rac)
}
/*
- * For releasepage() and invalidate_folio() we have a race window where
+ * For release_folio() and invalidate_folio() we have a race window where
* folio_end_writeback() is called but the subpage spinlock is not yet released.
* If we continue to release/invalidate the page, we could cause use-after-free
* for subpage spinlock. So this function is to spin and wait for subpage
@@ -8193,7 +8215,7 @@ static void wait_subpage_spinlock(struct page *page)
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->private);
@@ -8214,22 +8236,22 @@ static void wait_subpage_spinlock(struct page *page)
spin_unlock_irq(&subpage->lock);
}
-static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- int ret = try_release_extent_mapping(page, gfp_flags);
+ int ret = try_release_extent_mapping(&folio->page, gfp_flags);
if (ret == 1) {
- wait_subpage_spinlock(page);
- clear_page_extent_mapped(page);
+ wait_subpage_spinlock(&folio->page);
+ clear_page_extent_mapped(&folio->page);
}
return ret;
}
-static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- if (PageWriteback(page) || PageDirty(page))
- return 0;
- return __btrfs_releasepage(page, gfp_flags);
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
+ return __btrfs_release_folio(folio, gfp_flags);
}
#ifdef CONFIG_MIGRATION
@@ -8300,7 +8322,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* still safe to wait for ordered extent to finish.
*/
if (!(offset == 0 && length == folio_size(folio))) {
- btrfs_releasepage(&folio->page, GFP_NOFS);
+ btrfs_release_folio(folio, GFP_NOFS);
return;
}
@@ -8424,7 +8446,7 @@ next:
ASSERT(!folio_test_ordered(folio));
btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
- __btrfs_releasepage(&folio->page, GFP_NOFS);
+ __btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
}
@@ -8775,46 +8797,23 @@ out:
return ret;
}
-/*
- * create a new subvolume directory/inode (helper for the ioctl).
- */
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- struct user_namespace *mnt_userns)
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
struct inode *inode;
- int err;
- u64 index = 0;
- u64 ino;
-
- err = btrfs_get_free_objectid(new_root, &ino);
- if (err < 0)
- return err;
-
- inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
- ino, ino,
- S_IFDIR | (~current_umask() & S_IRWXUGO),
- &index);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- inode->i_op = &btrfs_dir_inode_operations;
- inode->i_fop = &btrfs_dir_file_operations;
-
- set_nlink(inode, 1);
- btrfs_i_size_write(BTRFS_I(inode), 0);
- unlock_new_inode(inode);
-
- err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
- if (err)
- btrfs_err(new_root->fs_info,
- "error inheriting subvolume %llu properties: %d",
- new_root->root_key.objectid, err);
-
- err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
- iput(inode);
- return err;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ /*
+ * Subvolumes don't inherit the sgid bit or the parent's gid if
+ * the parent's sgid bit is set. This is probably a bug.
+ */
+ inode_init_owner(mnt_userns, inode, NULL,
+ S_IFDIR | (~current_umask() & S_IRWXUGO));
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ }
+ return inode;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -8954,7 +8953,7 @@ int btrfs_drop_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+ struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
}
@@ -8966,6 +8965,7 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
+ bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
kmem_cache_destroy(btrfs_trans_handle_cachep);
kmem_cache_destroy(btrfs_path_cachep);
@@ -9006,6 +9006,11 @@ int __init btrfs_init_cachep(void)
if (!btrfs_free_space_bitmap_cachep)
goto fail;
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bio),
+ BIOSET_NEED_BVECS))
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -9061,6 +9066,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
@@ -9092,14 +9098,37 @@ static int btrfs_rename_exchange(struct inode *old_dir,
down_read(&fs_info->subvol_sem);
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
- * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
- * should cover the worst case number of items we'll modify.
+ * For each inode:
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ * 1 to update parent inode
+ *
+ * If the parents are the same, we only need to account for one
*/
- trans = btrfs_start_transaction(root, 12);
+ trans_num_items = (old_dir == new_dir ? 9 : 10);
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode item
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ trans_num_items += 4;
+ else
+ trans_num_items += 3;
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
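/*
 * Editor's sketch (not part of the patch): a hypothetical rendering of the
 * reservation arithmetic in the comment above, to make the bounds easy to
 * check -- two normal inodes in the same parent need 9 + 3 + 3 = 15 items,
 * two subvolumes across different parents 10 + 4 + 4 = 18, versus the old
 * flat reservation of 12:
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int exchange_trans_items(bool same_dir, bool old_is_subvol,
					 bool new_is_subvol)
{
	unsigned int items = same_dir ? 9 : 10;	/* 2x dir item/index out+in, parents */

	items += old_is_subvol ? 4 : 3;	/* root refs/backrefs vs inode item+refs */
	items += new_is_subvol ? 4 : 3;
	return items;
}

int main(void)
{
	printf("%u\n", exchange_trans_items(true, false, false));	/* 15 */
	printf("%u\n", exchange_trans_items(false, true, true));	/* 18 */
	return 0;
}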
@@ -9266,56 +9295,19 @@ out_notrans:
return ret;
}
-static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct user_namespace *mnt_userns,
- struct inode *dir,
- struct dentry *dentry)
+static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
- int ret;
struct inode *inode;
- u64 objectid;
- u64 index;
- ret = btrfs_get_free_objectid(root, &objectid);
- if (ret)
- return ret;
-
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name,
- dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)),
- objectid,
- S_IFCHR | WHITEOUT_MODE,
- &index);
-
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- return ret;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ inode_init_owner(mnt_userns, inode, dir,
+ S_IFCHR | WHITEOUT_MODE);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
}
-
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode,
- WHITEOUT_DEV);
-
- ret = btrfs_init_inode_security(trans, inode, dir,
- &dentry->d_name);
- if (ret)
- goto out;
-
- ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (ret)
- goto out;
-
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
-out:
- unlock_new_inode(inode);
- if (ret)
- inode_dec_link_count(inode);
- iput(inode);
-
- return ret;
+ return inode;
}
static int btrfs_rename(struct user_namespace *mnt_userns,
@@ -9324,6 +9316,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_new_inode_args whiteout_args = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ };
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -9378,23 +9374,56 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);
- /* close the racy window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (flags & RENAME_WHITEOUT) {
+ whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
+ if (!whiteout_args.inode)
+ return -ENOMEM;
+ ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
+ if (ret)
+ goto out_whiteout_inode;
+ } else {
+ /* 1 to update the old parent inode. */
+ trans_num_items = 1;
+ }
+
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* Close the race window with snapshot create/destroy ioctl */
down_read(&fs_info->subvol_sem);
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume they are normal
- * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- * should cover the worst case number of items we'll modify.
- * If our rename has the whiteout flag, we need more 5 units for the
- * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
- * when selinux is enabled).
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
*/
- trans_num_items = 11;
- if (flags & RENAME_WHITEOUT)
+ trans_num_items += 4;
+ /* 1 to update new parent inode if it's not the same as the old parent */
+ if (new_dir != old_dir)
+ trans_num_items++;
+ if (new_inode) {
+ /*
+ * 1 to update inode
+ * 1 to remove inode ref
+ * 1 to remove dir item
+ * 1 to remove dir index
+ * 1 to possibly add orphan item
+ */
trans_num_items += 5;
+ }
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -9490,12 +9519,14 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
- old_dir, old_dentry);
-
+ ret = btrfs_create_new_inode(trans, &whiteout_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
+ } else {
+ unlock_new_inode(whiteout_args.inode);
+ iput(whiteout_args.inode);
+ whiteout_args.inode = NULL;
}
}
out_fail:
@@ -9504,7 +9535,11 @@ out_fail:
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
-
+ if (flags & RENAME_WHITEOUT)
+ btrfs_new_inode_args_destroy(&whiteout_args);
+out_whiteout_inode:
+ if (flags & RENAME_WHITEOUT)
+ iput(whiteout_args.inode);
return ret;
}
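
To make the accounting above concrete, here is a small standalone sketch (ordinary userspace C, not part of the patch) that reproduces the item arithmetic for the worst interesting case: a cross-directory RENAME_WHITEOUT of a regular file over an existing target. The 5 items attributed to btrfs_new_inode_prepare() are an assumption for the example; the real value depends on security xattrs and ACLs.

#include <stdbool.h>
#include <stdio.h>

/* Illustration only: mirrors the trans_num_items accounting above. */
static unsigned int rename_items(bool whiteout, bool subvol_rename,
				 bool cross_dir, bool target_exists)
{
	/* Assumed: btrfs_new_inode_prepare() asks for 5 items for the
	 * whiteout inode; otherwise 1 to update the old parent inode. */
	unsigned int n = whiteout ? 5 : 1;

	n += subvol_rename ? 4 : 3;	/* root refs/backrefs vs inode + refs */
	n += 4;				/* old/new dir item and dir index */
	if (cross_dir)
		n++;			/* update the new parent inode */
	if (target_exists)
		n += 5;			/* unlink the existing target */
	return n;
}

int main(void)
{
	printf("%u items\n", rename_items(true, false, true, true)); /* 18 */
	return 0;
}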
@@ -9723,10 +9758,13 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct btrfs_key key;
- struct inode *inode = NULL;
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ };
+ unsigned int trans_num_items;
int err;
- u64 objectid;
- u64 index = 0;
int name_len;
int datasize;
unsigned long ptr;
@@ -9737,49 +9775,40 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return -ENAMETOOLONG;
- /*
- * 2 items for inode item and ref
- * 2 items for dir items
- * 1 item for updating parent inode item
- * 1 item for the inline extent item
- * 1 item for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 7);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO);
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &btrfs_aops;
+ btrfs_i_size_write(BTRFS_I(inode), name_len);
+ inode_set_bytes(inode, name_len);
- err = btrfs_get_free_objectid(root, &objectid);
+ new_inode_args.inode = inode;
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
+ /* 1 additional item for the inline extent */
+ trans_num_items++;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid,
- S_IFLNK | S_IRWXUGO, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
- inode->i_mapping->a_ops = &btrfs_aops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
if (err)
- goto out_unlock;
+ goto out;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
- goto out_unlock;
+ btrfs_abort_transaction(trans, err);
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
key.offset = 0;
@@ -9788,8 +9817,11 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (err) {
+ btrfs_abort_transaction(trans, err);
btrfs_free_path(path);
- goto out_unlock;
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -9807,31 +9839,16 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode_set_bytes(inode, name_len);
- btrfs_i_size_write(BTRFS_I(inode), name_len);
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- /*
- * Last step, add directory indexes for our symlink inode. This is the
- * last step to avoid extra cleanup of these indexes if an error happens
- * elsewhere above.
- */
- if (!err)
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (err)
- goto out_unlock;
-
d_instantiate_new(dentry, inode);
-
-out_unlock:
+ err = 0;
+out:
btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
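
The symlink path above follows the same lifecycle every converted creation path in this series uses: allocate and configure a VFS inode, let btrfs_new_inode_prepare() size the transaction, create the inode inside the transaction, and unwind through labels in reverse order on failure. A minimal userspace sketch of that shape, with stand-in types (none of these stubs are the real kernel API):

#include <errno.h>
#include <stdlib.h>

/* Stand-ins for the kernel objects; illustration only. */
struct inode { int unused; };
struct new_inode_args { struct inode *inode; };

static struct inode *alloc_inode(void) { return calloc(1, sizeof(struct inode)); }
static int prepare(struct new_inode_args *a, unsigned int *items) { *items = 5; return 0; }
static int create(struct new_inode_args *a) { return 0; }
static void args_destroy(struct new_inode_args *a) { }	/* frees prepared state */
static void iput(struct inode *i) { free(i); }

static int create_thing(void)
{
	struct new_inode_args a = { 0 };
	unsigned int trans_num_items;
	int err;

	a.inode = alloc_inode();
	if (!a.inode)
		return -ENOMEM;
	err = prepare(&a, &trans_num_items);
	if (err)
		goto out_inode;
	/* start a transaction sized by trans_num_items, then: */
	err = create(&a);
	args_destroy(&a);
out_inode:
	if (err)
		iput(a.inode);
	return err;
}

int main(void) { return create_thing() ? 1 : 0; }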
@@ -10082,62 +10099,58 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- u64 objectid;
- u64 index;
- int ret = 0;
-
- /*
- * 5 units required for adding orphan entry
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_get_free_objectid(root, &objectid);
- if (ret)
- goto out;
-
- inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- inode = NULL;
- goto out;
- }
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .orphan = true,
+ };
+ unsigned int trans_num_items;
+ int ret;
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
-
inode->i_mapping->a_ops = &btrfs_aops;
- ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ new_inode_args.inode = inode;
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (ret)
- goto out;
+ goto out_inode;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (ret)
- goto out;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret)
- goto out;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_new_inode_args;
+ }
+
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
/*
- * We set number of links to 0 in btrfs_new_inode(), and here we set
- * it to 1 because d_tmpfile() will issue a warning if the count is 0,
- * through:
+ * We set number of links to 0 in btrfs_create_new_inode(), and here we
+ * set it to 1 because d_tmpfile() will issue a warning if the count is
+ * 0, through:
*
* d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
*/
set_nlink(inode, 1);
- d_tmpfile(dentry, inode);
- unlock_new_inode(inode);
- mark_inode_dirty(inode);
-out:
+
+ if (!ret) {
+ d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
+ mark_inode_dirty(inode);
+ }
+
btrfs_end_transaction(trans);
- if (ret && inode)
- discard_new_inode(inode);
btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (ret)
+ iput(inode);
return ret;
}
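
The set_nlink() dance above is easy to miss: btrfs_create_new_inode() leaves an O_TMPFILE inode with zero links, but d_tmpfile() unconditionally drops one link and warns if the count is already zero. A toy model of the invariant:

#include <assert.h>
#include <stdio.h>

struct inode { int nlink; };

static void set_nlink(struct inode *i, int n) { i->nlink = n; }

static void d_tmpfile(struct inode *i)
{
	assert(i->nlink > 0);	/* the kernel warns here if nlink is 0 */
	i->nlink--;
}

int main(void)
{
	struct inode ino = { 0 };

	set_nlink(&ino, 1);	/* 0 -> 1, as done above */
	d_tmpfile(&ino);	/* back to 0: an orphan until linked */
	printf("nlink = %d\n", ino.nlink);
	return 0;
}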
@@ -10469,13 +10482,11 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i]) {
- ret = -ENOMEM;
- goto out;
+ ret = btrfs_alloc_page_array(nr_pages, pages);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
}
- }
ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
disk_io_size, pages);
@@ -10811,7 +10822,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
if (ret)
goto out_free_data_space;
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
+ false);
if (ret)
goto out_qgroup_free_data;
@@ -11320,6 +11332,41 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
spin_unlock(&inode->lock);
}
+/**
+ * Verify that there are no ordered extents for a given file range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the file range, should be sector size aligned.
+ * @end: End offset (inclusive) of the file range, its value +1 should be
+ * sector size aligned.
+ *
+ * This should typically be used for cases where we have locked the inode's VFS
+ * lock in exclusive mode, locked the inode's i_mmap_lock in exclusive mode,
+ * flushed all delalloc in the range, waited for all ordered extents in the
+ * range to complete, and finally locked the file range in the inode's io_tree.
+ */
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_ordered_extent *ordered;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
+ if (ordered) {
+ btrfs_err(root->fs_info,
+"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
+ start, end, btrfs_ino(inode), root->root_key.objectid,
+ ordered->file_offset,
+ ordered->file_offset + ordered->num_bytes - 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ ASSERT(ordered == NULL);
+}
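
The range convention in the kernel-doc above is worth spelling out: @end is inclusive, so a clean range covers exactly (end + 1 - start) / sectorsize sectors, which is also the length passed to btrfs_lookup_first_ordered_range(). A quick check with an assumed 4K sector size:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t sectorsize = 4096;	/* assumed for the example */
	const uint64_t start = 8192;
	const uint64_t end = 16383;		/* inclusive: [8192, 16383] */

	assert(start % sectorsize == 0);	/* start is sector aligned */
	assert((end + 1) % sectorsize == 0);	/* end + 1 is sector aligned */
	printf("range spans %llu sectors\n",
	       (unsigned long long)((end + 1 - start) / sectorsize));	/* 2 */
	return 0;
}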
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -11368,13 +11415,13 @@ static const struct file_operations btrfs_dir_file_operations = {
* For now we're avoiding this by dropping bmap.
*/
static const struct address_space_operations btrfs_aops = {
- .readpage = btrfs_readpage,
+ .read_folio = btrfs_read_folio,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
.invalidate_folio = btrfs_invalidate_folio,
- .releasepage = btrfs_releasepage,
+ .release_folio = btrfs_release_folio,
#ifdef CONFIG_MIGRATION
.migratepage = btrfs_migratepage,
#endif
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index be6c24577dbe..0f79af919bc4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -468,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_device *device;
- struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
@@ -498,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
- if (!device->bdev)
+ if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
continue;
- q = bdev_get_queue(device->bdev);
- if (blk_queue_discard(q)) {
- num_devices++;
- minlen = min_t(u64, q->limits.discard_granularity,
- minlen);
- }
+ num_devices++;
+ minlen = min_t(u64, bdev_discard_granularity(device->bdev),
+ minlen);
}
rcu_read_unlock();
@@ -544,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
+/*
+ * Calculate the number of transaction items to reserve for creating a subvolume
+ * or snapshot, not including the inode, directory entries, or parent directory.
+ */
+static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+{
+ /*
+ * 1 to add root block
+ * 1 to add root item
+ * 1 to add root ref
+ * 1 to add root backref
+ * 1 to add UUID item
+ * 1 to add qgroup info
+ * 1 to add qgroup limit
+ *
+ * Ideally the last two would only be accounted if qgroups are enabled,
+ * but that can change between now and the time we would insert them.
+ */
+ unsigned int num_items = 7;
+
+ if (inherit) {
+ /* 2 to add qgroup relations for each inherited qgroup */
+ num_items += 2 * inherit->num_qgroups;
+ }
+ return num_items;
+}
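
As a worked example of the helper above: a plain subvolume reserves 7 items, a creation inheriting three qgroups reserves 7 + 2 * 3 = 13, and the snapshot path below adds 3 more for the directory entries and parent inode update.

#include <stdio.h>

static unsigned int create_subvol_num_items(unsigned int num_qgroups)
{
	return 7 + 2 * num_qgroups;	/* same arithmetic as above */
}

int main(void)
{
	printf("subvol: %u, with 3 inherited qgroups: %u, snapshot: %u\n",
	       create_subvol_num_items(0),		/* 7  */
	       create_subvol_num_items(3),		/* 13 */
	       create_subvol_num_items(0) + 3);		/* 10 */
	return 0;
}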
+
static noinline int create_subvol(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
- const char *name, int namelen,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -559,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec64 cur_time = current_time(dir);
- struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .subvol = true,
+ };
+ unsigned int trans_num_items;
int ret;
- dev_t anon_dev = 0;
+ dev_t anon_dev;
u64 objectid;
- u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -571,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
- goto fail_free;
-
- ret = get_anon_bdev(&anon_dev);
- if (ret < 0)
- goto fail_free;
+ goto out_root_item;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
@@ -583,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
*/
if (btrfs_qgroup_level(objectid)) {
ret = -ENOSPC;
- goto fail_free;
+ goto out_root_item;
+ }
+
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto out_root_item;
+
+ new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+ if (!new_inode_args.inode) {
+ ret = -ENOMEM;
+ goto out_anon_dev;
}
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
+ goto out_inode;
+ trans_num_items += create_subvol_num_items(inherit);
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
- /*
- * The same as the snapshot creation, please see the comment
- * of create_snapshot().
- */
- ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ trans_num_items, false);
if (ret)
- goto fail_free;
+ goto out_new_inode_args;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv);
- goto fail_free;
+ goto out_new_inode_args;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
- goto fail;
+ goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
- goto fail;
+ goto out;
}
btrfs_mark_buffer_dirty(leaf);
@@ -667,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
btrfs_tree_unlock(leaf);
btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
- goto fail;
+ goto out;
}
free_extent_buffer(leaf);
leaf = NULL;
- key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
- free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- /* Freeing will be done in btrfs_put_root() of new_root */
+ /* anon_dev is owned by new_root now. */
anon_dev = 0;
+ BTRFS_I(new_inode_args.inode)->root = new_root;
+ /* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
- btrfs_put_root(new_root);
btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
- btrfs_put_root(new_root);
- if (ret) {
- /* We potentially lose an unused inode item here */
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- /*
- * insert the directory item
- */
- ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
- BTRFS_FT_DIR, index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
- btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_uuid_tree_add(trans, root_item->uuid,
- BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ d_instantiate_new(dentry, new_inode_args.inode);
+ new_inode_args.inode = NULL;
-fail:
- kfree(root_item);
+out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
@@ -744,18 +748,14 @@ fail:
btrfs_end_transaction(trans);
else
ret = btrfs_commit_transaction(trans);
-
- if (!ret) {
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- d_instantiate(dentry, inode);
- }
- return ret;
-
-fail_free:
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ iput(new_inode_args.inode);
+out_anon_dev:
if (anon_dev)
free_anon_bdev(anon_dev);
+out_root_item:
kfree(root_item);
return ret;
}
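
Two idioms in the rewritten error paths above deserve a note. First, ownership handoff is recorded by clearing the local handle (anon_dev = 0, new_inode_args.inode = NULL) so the shared cleanup labels cannot double-free. Second, each out_* label undoes exactly one acquisition, in reverse order of setup. A compact userspace rendering of both:

#include <errno.h>
#include <stdlib.h>

static int demo(void)
{
	char *root_item, *inode = NULL;
	int anon_dev = 0, ret = 0;

	root_item = malloc(16);
	if (!root_item)
		return -ENOMEM;
	anon_dev = 1;			/* pretend get_anon_bdev() worked */
	inode = malloc(16);
	if (!inode) {
		ret = -ENOMEM;
		goto out_anon_dev;
	}
	/* ... main work would happen here ... */
	free(inode);			/* stand-in for the final iput() */
	inode = NULL;			/* handle cleared: labels can't double-free */
out_anon_dev:
	if (anon_dev)
		anon_dev = 0;		/* free_anon_bdev() equivalent */
	free(root_item);
	return ret;
}

int main(void) { return demo() ? 1 : 0; }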
@@ -767,6 +767,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
+ unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
int ret;
@@ -804,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - parent dir inode
- * 2 - dir entries
- * 1 - root item
- * 2 - root ref/backref
- * 1 - root of snapshot
- * 1 - UUID item
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
*/
+ trans_num_items = create_subvol_num_items(inherit) + 3;
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
- &pending_snapshot->block_rsv, 8,
- false);
+ &pending_snapshot->block_rsv,
+ trans_num_items, false);
if (ret)
goto free_pending;
@@ -983,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (snap_src)
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
+ error = create_subvol(mnt_userns, dir, dentry, inherit);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -1359,7 +1358,7 @@ again:
* make it uptodate.
*/
if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != mapping || !PagePrivate(page)) {
unlock_page(page);
@@ -1417,8 +1416,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (!em)
break;
- /* Skip hole/inline/preallocated extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ /*
+ * If the file extent is an inline one, we may still want to
+ * defrag it (fall through) if defragging it would result in a
+ * regular extent. This is for users who want to convert inline
+ * extents to regular ones through the max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
goto next;
@@ -1477,6 +1487,15 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (em->len >= get_extent_max_capacity(em))
goto next;
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and the extent would not
+ * be defragged. So if an inline extent passed all the above checks,
+ * just add it for defrag so it gets converted to a regular extent.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
extent_thresh, newer_than, locked);
if (!next_mergeable) {
@@ -2565,7 +2584,12 @@ static noinline int search_ioctl(struct inode *inode,
while (1) {
ret = -EFAULT;
- if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
+ /*
+ * Ensure that the whole user buffer is faulted in at sub-page
+ * granularity, otherwise the loop may live-lock.
+ */
+ if (fault_in_subpage_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset))
break;
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
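
The sub-page wording in the comment above matters: fault_in_writeable() probes at page granularity, so on architectures where a user copy can fault within a page (for example arm64 MTE tag mismatches) the probe could succeed while copy_to_user() keeps failing at the same offset, and the loop would spin forever. The loop's progress guarantee, in miniature:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool fault_in(char *buf, size_t len) { return buf != NULL && len > 0; }

static int search(char *ubuf, size_t buf_size)
{
	size_t sk_offset = 0;

	while (sk_offset < buf_size) {
		/* Either this proves the rest of the buffer is writable... */
		if (!fault_in(ubuf + sk_offset, buf_size - sk_offset))
			return -1;	/* ...or we bail out (-EFAULT) */
		/* ...and copying a result advances sk_offset: progress. */
		ubuf[sk_offset++] = 'x';
	}
	return 0;
}

int main(void)
{
	char buf[8];

	printf("%d\n", search(buf, sizeof(buf)));	/* 0 */
	return 0;
}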
@@ -2593,7 +2617,7 @@ err:
static noinline int btrfs_ioctl_tree_search(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
size_t buf_size;
@@ -2601,8 +2625,6 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- uargs = (struct btrfs_ioctl_search_args __user *)argp;
-
if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
return -EFAULT;
@@ -2625,7 +2647,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
size_t buf_size;
@@ -2635,7 +2657,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
return -EPERM;
/* copy search header and buffer size */
- uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
if (copy_from_user(&args, uarg, sizeof(args)))
return -EFAULT;
@@ -4343,10 +4364,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bool need_unlock; /* for mut. excl. ops lock */
int ret;
- if (!arg)
- btrfs_warn(fs_info,
- "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4354,6 +4371,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ bargs = NULL;
+ goto out;
+ }
+
again:
if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
@@ -4401,59 +4425,42 @@ again:
}
locked:
-
- if (arg) {
- bargs = memdup_user(arg, sizeof(*bargs));
- if (IS_ERR(bargs)) {
- ret = PTR_ERR(bargs);
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
goto out_unlock;
}
- if (bargs->flags & BTRFS_BALANCE_RESUME) {
- if (!fs_info->balance_ctl) {
- ret = -ENOTCONN;
- goto out_bargs;
- }
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
- bctl = fs_info->balance_ctl;
- spin_lock(&fs_info->balance_lock);
- bctl->flags |= BTRFS_BALANCE_RESUME;
- spin_unlock(&fs_info->balance_lock);
- btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
+ goto do_balance;
+ }
- goto do_balance;
- }
- } else {
- bargs = NULL;
+ if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_unlock;
}
if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
- goto out_bargs;
+ goto out_unlock;
}
bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
- goto out_bargs;
- }
-
- if (arg) {
- memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
- memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
- memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
-
- bctl->flags = bargs->flags;
- } else {
- /* balance everything - no filters */
- bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ goto out_unlock;
}
- if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
- ret = -EINVAL;
- goto out_bctl;
- }
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+ bctl->flags = bargs->flags;
do_balance:
/*
* Ownership of bctl and exclusive operation goes to btrfs_balance.
@@ -4466,21 +4473,19 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if ((ret == 0 || ret == -ECANCELED) && arg) {
+ if (ret == 0 || ret == -ECANCELED) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
-out_bctl:
kfree(bctl);
-out_bargs:
- kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
+ kfree(bargs);
return ret;
}
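
The restructuring above moves the memdup_user() copy ahead of the lock/exclop dance, which is why the v1 no-argument path could go away: there is always a kernel copy of the args, nothing is allocated while balance_mutex is held, and a single kfree(bargs) at the very end replaces the out_bargs/out_bctl ladder. A userspace stand-in for memdup_user() (the real one returns ERR_PTR(-EFAULT) on a bad user pointer):

#include <stdlib.h>
#include <string.h>

static void *memdup_user(const void *uptr, size_t len)
{
	void *p = malloc(len);

	if (!p)
		return NULL;
	memcpy(p, uptr, len);	/* copy_from_user() in the kernel */
	return p;
}

int main(void)
{
	int arg = 42;
	int *bargs = memdup_user(&arg, sizeof(arg));

	if (!bargs)
		return 1;
	/* ... take locks, validate flags, use *bargs ... */
	free(bargs);		/* one cleanup path, taken on every exit */
	return 0;
}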
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 1a6d2d5b4b33..a2ec8ecae8de 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
struct prop_handler {
struct hlist_node node;
const char *xattr_name;
- int (*validate)(const char *value, size_t len);
+ int (*validate)(const struct btrfs_inode *inode, const char *value,
+ size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
const char *(*extract)(struct inode *inode);
+ bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -55,7 +57,8 @@ find_prop_handler(const char *name,
return NULL;
}
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len)
{
const struct prop_handler *handler;
@@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
if (value_len == 0)
return 0;
- return handler->validate(value, value_len);
+ return handler->validate(inode, value, value_len);
+}
+
+/*
+ * Check if a property should be ignored (not set) for an inode.
+ *
+ * @inode: The target inode.
+ * @name: The property's name.
+ *
+ * The caller must be sure the given property name is valid, for example by
+ * having previously called btrfs_validate_prop().
+ *
+ * Returns: true if the property should be ignored for the given inode
+ * false if the property must not be ignored for the given inode
+ */
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
+{
+ const struct prop_handler *handler;
+
+ handler = find_prop_handler(name, NULL);
+ ASSERT(handler != NULL);
+
+ return handler->ignore(inode);
}
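
A plausible call-site shape for the new hook (the real caller lives in fs/btrfs/xattr.c, which this series also touches; the exact code below is an assumption, not quoted from the patch): validate first, then silently succeed when the property is meaningless for the inode type.

#include <stdbool.h>
#include <stddef.h>

/* Declarations only, so the shape compiles; see the definitions above. */
struct btrfs_inode;
int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
			const char *value, size_t value_len);
bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);

static int set_prop_xattr(struct btrfs_inode *inode, const char *name,
			  const char *value, size_t len)
{
	int ret = btrfs_validate_prop(inode, name, value, len);

	if (ret)
		return ret;
	if (btrfs_ignore_prop(inode, name))
		return 0;	/* valid, but a no-op for this inode type */
	/* ... persist the xattr and apply the property ... */
	return 0;
}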
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -252,8 +277,12 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
return ret;
}
-static int prop_compression_validate(const char *value, size_t len)
+static int prop_compression_validate(const struct btrfs_inode *inode,
+ const char *value, size_t len)
{
+ if (!btrfs_inode_can_compress(inode))
+ return -EINVAL;
+
if (!value)
return 0;
@@ -310,6 +339,22 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return 0;
}
+static bool prop_compression_ignore(const struct btrfs_inode *inode)
+{
+ /*
+ * Compression only has an effect for regular files, and for
+ * directories we set it just so it propagates to new files created
+ * inside them. For everything else (symlinks, devices, sockets,
+ * fifos) it is pointless as it will do nothing, so don't waste
+ * metadata space on a compression xattr for anything that is
+ * neither a file nor a directory.
+ */
+ if (!S_ISREG(inode->vfs_inode.i_mode) &&
+ !S_ISDIR(inode->vfs_inode.i_mode))
+ return true;
+
+ return false;
+}
+
static const char *prop_compression_extract(struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
@@ -330,13 +375,13 @@ static struct prop_handler prop_handlers[] = {
.validate = prop_compression_validate,
.apply = prop_compression_apply,
.extract = prop_compression_extract,
+ .ignore = prop_compression_ignore,
.inheritable = 1
},
};
-static int inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *parent)
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -356,6 +401,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
+ if (h->ignore(BTRFS_I(inode)))
+ continue;
+
value = h->extract(parent);
if (!value)
continue;
@@ -364,7 +412,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(value, strlen(value));
+ ret = h->validate(BTRFS_I(inode), value, strlen(value));
if (ret)
continue;
@@ -408,41 +456,6 @@ static int inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *dir)
-{
- if (!dir)
- return 0;
-
- return inherit_props(trans, inode, dir);
-}
-
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root)
-{
- struct super_block *sb = root->fs_info->sb;
- struct inode *parent_inode, *child_inode;
- int ret;
-
- parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root);
- if (IS_ERR(parent_inode))
- return PTR_ERR(parent_inode);
-
- child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root);
- if (IS_ERR(child_inode)) {
- iput(parent_inode);
- return PTR_ERR(child_inode);
- }
-
- ret = inherit_props(trans, child_inode, parent_inode);
- iput(child_inode);
- iput(parent_inode);
-
- return ret;
-}
-
void __init btrfs_props_init(void)
{
int i;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 40b2c65b518c..ca9dd3df129b 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -13,7 +13,9 @@ void __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len);
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len);
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
@@ -21,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *dir);
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root);
-
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1866b1f0da01..db723c0026bd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2290,7 +2290,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
return 0;
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
+ ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
@@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
}
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce)
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush)
{
int ret;
ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
- if (ret <= 0 && ret != -EDQUOT)
+ if ((ret <= 0 && ret != -EDQUOT) || noflush)
return ret;
ret = try_flush_qgroup(root);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 880e9df0dac1..0c4dd2a9af96 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce);
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush);
/* Reserve metadata space for pertrans and prealloc type */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
int num_bytes, bool enforce)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ enforce, false);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
- int num_bytes, bool enforce)
+ int num_bytes, bool enforce,
+ bool noflush)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ enforce, noflush);
}
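
The new noflush flag short-circuits the flush-and-retry fallback: a caller that must not block (for example via the extra bool threaded into btrfs_delalloc_reserve_metadata() earlier in this diff) gets the raw -EDQUOT back immediately. The decision logic, reduced to a runnable stand-in:

#include <stdbool.h>
#include <stdio.h>

#define EDQUOT 122

static int reserve_meta(bool over_quota, bool noflush)
{
	int ret = over_quota ? -EDQUOT : 0;

	/* Same condition as __btrfs_qgroup_reserve_meta() above. */
	if ((ret <= 0 && ret != -EDQUOT) || noflush)
		return ret;
	/* ... try_flush_qgroup() and a retry would happen here ... */
	return 0;
}

int main(void)
{
	printf("%d\n", reserve_meta(true, true));	/* -122, bubbles up */
	printf("%d\n", reserve_meta(true, false));	/* 0 after flush+retry */
	return 0;
}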
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0e239a4c3b26..a5b623ee6fac 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -52,6 +52,17 @@ struct btrfs_stripe_hash_table {
struct btrfs_stripe_hash table[];
};
+/*
+ * A bvec-like structure to represent a sector inside a page.
+ *
+ * Unlike bvec we don't need bv_len, as it's fixed to sectorsize.
+ */
+struct sector_ptr {
+ struct page *page;
+ unsigned int pgoff:24;
+ unsigned int uptodate:8;
+};
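
The bitfield packing is deliberate: pgoff:24 can express offsets within any page size Linux supports (up to 16 MiB), and folding uptodate into the remaining byte keeps the struct at one pointer plus 4 bytes, which matters because an rbio embeds nr_sectors of these per array. A quick size check under the same layout:

#include <stdio.h>

struct sector_ptr {
	void *page;			/* struct page * in the kernel */
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};

int main(void)
{
	/* Typically 16 bytes on LP64: 8 for the pointer, 4 packed, 4 pad. */
	printf("sizeof(struct sector_ptr) = %zu\n", sizeof(struct sector_ptr));
	printf("max pgoff = %u\n", (1u << 24) - 1);	/* 16 MiB - 1 */
	return 0;
}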
+
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
@@ -77,7 +88,7 @@ struct btrfs_raid_bio {
/*
* for scheduling work in the helper threads
*/
- struct btrfs_work work;
+ struct work_struct work;
/*
* bio list and bio_list_lock are used
@@ -101,15 +112,6 @@ struct btrfs_raid_bio {
*/
unsigned long flags;
- /* size of each individual stripe on disk */
- int stripe_len;
-
- /* number of data stripes (no p/q) */
- int nr_data;
-
- int real_stripes;
-
- int stripe_npages;
/*
* set if we're doing a parity rebuild
* for a read from higher up, which is handled
@@ -118,18 +120,35 @@ struct btrfs_raid_bio {
*/
enum btrfs_rbio_ops operation;
- /* first bad stripe */
- int faila;
+ /* Size of each individual stripe on disk */
+ u32 stripe_len;
- /* second bad stripe (for raid6 use) */
- int failb;
+ /* How many pages there are for the full stripe including P/Q */
+ u16 nr_pages;
- int scrubp;
- /*
- * number of pages needed to represent the full
- * stripe
- */
- int nr_pages;
+ /* How many sectors there are for the full stripe including P/Q */
+ u16 nr_sectors;
+
+ /* Number of data stripes (no p/q) */
+ u8 nr_data;
+
+ /* Number of all stripes (including P/Q) */
+ u8 real_stripes;
+
+ /* How many pages there are for each stripe */
+ u8 stripe_npages;
+
+ /* How many sectors there are for each stripe */
+ u8 stripe_nsectors;
+
+ /* First bad stripe, -1 means no corruption */
+ s8 faila;
+
+ /* Second bad stripe (for RAID6 use) */
+ s8 failb;
+
+ /* Stripe number that we're scrubbing */
+ u8 scrubp;
/*
* size of all the bios in the bio_list. This
@@ -156,28 +175,29 @@ struct btrfs_raid_bio {
*/
struct page **stripe_pages;
- /*
- * pointers to the pages in the bio_list. Stored
- * here for faster lookup
- */
- struct page **bio_pages;
+ /* Pointers to the sectors in the bio_list, for faster lookup */
+ struct sector_ptr *bio_sectors;
/*
- * bitmap to record which horizontal stripe has data
+ * For subpage support, we need to map each sector to above
+ * stripe_pages.
*/
+ struct sector_ptr *stripe_sectors;
+
+ /* Bitmap to record which horizontal stripe has data */
unsigned long *dbitmap;
/* allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
- /* allocated with stripe_npages-many bits for finish_*() calls */
+ /* Allocated with stripe_nsectors-many bits for finish_*() calls */
unsigned long *finish_pbitmap;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct btrfs_work *work);
-static void read_rebuild_work(struct btrfs_work *work);
+static void rmw_work(struct work_struct *work);
+static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
@@ -186,12 +206,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check);
-static void scrub_parity_work(struct btrfs_work *work);
+static void scrub_parity_work(struct work_struct *work);
-static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
+static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
- btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
+ INIT_WORK(&rbio->work, work_func);
+ queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -239,7 +259,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
/*
* caching an rbio means to copy anything from the
- * bio_pages array into the stripe_pages array. We
+ * bio_sectors array into the stripe_pages array. We
* use the page uptodate bit in the stripe cache array
* to indicate if it has valid data
*
@@ -255,12 +275,18 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
if (ret)
return;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (!rbio->bio_pages[i])
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ /* Some range not covered by bio (partial write), skip it */
+ if (!rbio->bio_sectors[i].page)
continue;
- copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
- SetPageUptodate(rbio->stripe_pages[i]);
+ ASSERT(rbio->stripe_sectors[i].page);
+ memcpy_page(rbio->stripe_sectors[i].page,
+ rbio->stripe_sectors[i].pgoff,
+ rbio->bio_sectors[i].page,
+ rbio->bio_sectors[i].pgoff,
+ rbio->bioc->fs_info->sectorsize);
+ rbio->stripe_sectors[i].uptodate = 1;
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
@@ -283,9 +309,50 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
+static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
+ unsigned int page_nr)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page;
+ i++) {
+ if (!rbio->stripe_sectors[i].uptodate)
+ return false;
+ }
+ return true;
+}
+
/*
- * stealing an rbio means taking all the uptodate pages from the stripe
- * array in the source rbio and putting them into the destination rbio
+ * Update the stripe_sectors[] array to use the correct page and pgoff.
+ *
+ * Should be called every time any page pointer in stripe_pages[] is modified.
+ */
+static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ u32 offset;
+ int i;
+
+ for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
+ int page_index = offset >> PAGE_SHIFT;
+
+ ASSERT(page_index < rbio->nr_pages);
+ rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
+ rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ }
+}
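
A worked example of the mapping this builds, assuming 4K sectors on 64K pages (the subpage case the series targets): sectors 0-15 all point at stripe_pages[0] with pgoff 0, 4096, 8192, and so on, and sector 16 starts stripe_pages[1]. The same arithmetic in standalone form:

#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize = 4096;
	const unsigned int page_shift = 16;	/* assume 64K pages */
	unsigned int i, offset;

	for (i = 0, offset = 0; i < 18; i++, offset += sectorsize) {
		unsigned int page_index = offset >> page_shift;
		unsigned int pgoff = offset & ((1u << page_shift) - 1);

		if (i < 2 || i >= 15)	/* print the interesting boundaries */
			printf("sector %2u -> page %u pgoff %u\n",
			       i, page_index, pgoff);
	}
	return 0;
}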
+
+/*
+ * Stealing an rbio means taking all the uptodate pages from the stripe array
+ * in the source rbio and putting them into the destination rbio.
+ *
+ * This will also update the involved stripe_sectors[] which are referring to
+ * the old pages.
*/
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
@@ -298,9 +365,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
for (i = 0; i < dest->nr_pages; i++) {
s = src->stripe_pages[i];
- if (!s || !PageUptodate(s)) {
+ if (!s || !full_page_sectors_uptodate(src, i))
continue;
- }
d = dest->stripe_pages[i];
if (d)
@@ -309,6 +375,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
dest->stripe_pages[i] = s;
src->stripe_pages[i] = NULL;
}
+ index_stripe_sectors(dest);
+ index_stripe_sectors(src);
}
/*
@@ -600,39 +668,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
-static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return stripe * rbio->stripe_npages + index;
+ ASSERT(stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr < rbio->stripe_nsectors);
+
+ return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
-/*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+/* Return a sector from rbio->stripe_sectors, not from the bio list */
+static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+ return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
+ sector_nr)];
}
-/*
- * helper to index into the pstripe
- */
-static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside P stripe */
+static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
- return rbio_stripe_page(rbio, rbio->nr_data, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}
-/*
- * helper to index into the qstripe, returns null
- * if there is no qstripe
- */
-static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside Q stripe, return NULL if not RAID6 */
+static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
- return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
/*
@@ -911,47 +979,43 @@ static void raid_write_end_io(struct bio *bio)
rbio_orig_end_io(rbio, err);
}
-/*
- * the read/modify/write code wants to use the original bio for
- * any pages it included, and then use the rbio for everything
- * else. This function decides if a given index (stripe number)
- * and page number in that stripe fall inside the original bio
- * or the rbio.
- *
- * if you set bio_list_only, you'll get a NULL back for any ranges
- * that are outside the bio_list
+/**
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr
*
- * This doesn't take any refs on anything, you get a bare page pointer
- * and the caller must bump refs as required.
+ * @rbio: The raid bio
+ * @stripe_nr: Stripe number, valid range [0, real_stripe)
+ * @sector_nr: Sector number inside the stripe,
+ * valid range [0, stripe_nsectors)
+ * @bio_list_only: Whether to use sectors inside the bio list only.
*
- * You must call index_rbio_pages once before you can trust
- * the answers from this function.
+ * The read/modify/write code wants to reuse the original bio page as much
+ * as possible, and only use stripe_sectors as fallback.
*/
-static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
- int index, int pagenr, int bio_list_only)
+static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr,
+ bool bio_list_only)
{
- int chunk_page;
- struct page *p = NULL;
+ struct sector_ptr *sector;
+ int index;
+
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
- chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+ index = stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ASSERT(index >= 0 && index < rbio->nr_sectors);
spin_lock_irq(&rbio->bio_list_lock);
- p = rbio->bio_pages[chunk_page];
+ sector = &rbio->bio_sectors[index];
+ if (sector->page || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (!sector->page)
+ sector = NULL;
+ spin_unlock_irq(&rbio->bio_list_lock);
+ return sector;
+ }
spin_unlock_irq(&rbio->bio_list_lock);
- if (p || bio_list_only)
- return p;
-
- return rbio->stripe_pages[chunk_page];
-}
-
-/*
- * number of pages we need for the entire stripe across all the
- * drives
- */
-static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
-{
- return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
+ return &rbio->stripe_sectors[index];
}
/*
@@ -960,22 +1024,28 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
struct btrfs_io_context *bioc,
- u64 stripe_len)
+ u32 stripe_len)
{
+ const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+ const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
+ const unsigned int num_pages = stripe_npages * real_stripes;
+ const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
+ const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
- int num_pages = rbio_nr_pages(stripe_len, real_stripes);
- int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
+ ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
+ /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
+ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+
rbio = kzalloc(sizeof(*rbio) +
sizeof(*rbio->stripe_pages) * num_pages +
- sizeof(*rbio->bio_pages) * num_pages +
+ sizeof(*rbio->bio_sectors) * num_sectors +
+ sizeof(*rbio->stripe_sectors) * num_sectors +
sizeof(*rbio->finish_pointers) * real_stripes +
- sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
- sizeof(*rbio->finish_pbitmap) *
- BITS_TO_LONGS(stripe_npages),
+ sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
+ sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -988,8 +1058,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->bioc = bioc;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
+ rbio->nr_sectors = num_sectors;
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
+ rbio->stripe_nsectors = stripe_nsectors;
rbio->faila = -1;
rbio->failb = -1;
refcount_set(&rbio->refs, 1);
@@ -997,8 +1069,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
atomic_set(&rbio->stripes_pending, 0);
/*
- * the stripe_pages, bio_pages, etc arrays point to the extra
- * memory we allocated past the end of the rbio
+ * The stripe_pages, bio_sectors, etc arrays point to the extra memory
+ * we allocated past the end of the rbio.
*/
p = rbio + 1;
#define CONSUME_ALLOC(ptr, count) do { \
@@ -1006,10 +1078,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
} while (0)
CONSUME_ALLOC(rbio->stripe_pages, num_pages);
- CONSUME_ALLOC(rbio->bio_pages, num_pages);
+ CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
+ CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
- CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
- CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
+ CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
+ CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
#undef CONSUME_ALLOC
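
CONSUME_ALLOC is the classic single-allocation trick: the kzalloc above is sized for the struct plus every trailing array, and the macro carves typed pointers out of the tail so the whole rbio is freed with one kfree(). In miniature:

#include <stdio.h>
#include <stdlib.h>

struct demo {
	int nr;
	int *array_a;		/* both arrays live in the same allocation */
	char *array_b;
};

int main(void)
{
	const int nr = 8;
	struct demo *d = calloc(1, sizeof(*d) +
				   sizeof(*d->array_a) * nr +
				   sizeof(*d->array_b) * nr);
	unsigned char *p;

	if (!d)
		return 1;
	p = (unsigned char *)(d + 1);
	d->array_a = (int *)p;
	p += sizeof(*d->array_a) * nr;
	d->array_b = (char *)p;

	d->array_a[nr - 1] = 42;
	printf("%d\n", d->array_a[nr - 1]);
	free(d);		/* one free releases struct and both arrays */
	return 0;
}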
if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
@@ -1026,59 +1099,63 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ int ret;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ if (ret < 0)
+ return ret;
+ /* Mapping all sectors */
+ index_stripe_sectors(rbio);
return 0;
}
/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
+ int ret;
- i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
+ ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
+ rbio->stripe_pages + data_pages);
+ if (ret < 0)
+ return ret;
- for (; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ index_stripe_sectors(rbio);
return 0;
}
/*
- * add a single page from a specific stripe into our list of bios for IO
- * this will try to merge into existing bios if possible, and returns
- * zero if all went well.
+ * Add a single sector @sector into our list of bios for IO.
+ *
+ * Return 0 if everything went well.
+ * Return <0 for error.
*/
-static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct page *page,
- int stripe_nr,
- unsigned long page_index,
- unsigned long bio_max_len)
-{
+static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct sector_ptr *sector,
+ unsigned int stripe_nr,
+ unsigned int sector_nr,
+ unsigned long bio_max_len,
+ unsigned int opf)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
struct btrfs_io_stripe *stripe;
u64 disk_start;
+ /*
+ * Note: here stripe_nr has taken device replace into consideration,
+ * thus it can be larger than rbio->real_stripe.
+ * So here we check against bioc->num_stripes, not rbio->real_stripes.
+ */
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT(sector->page);
+
stripe = &rbio->bioc->stripes[stripe_nr];
- disk_start = stripe->physical + (page_index << PAGE_SHIFT);
+ disk_start = stripe->physical + sector_nr * sectorsize;
/* if the device is missing, just fail this stripe */
if (!stripe->dev->bdev)
@@ -1095,20 +1172,20 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
*/
if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, page, PAGE_SIZE, 0);
- if (ret == PAGE_SIZE)
+ ret = bio_add_page(last, sector->page, sectorsize,
+ sector->pgoff);
+ if (ret == sectorsize)
return 0;
}
}
/* put a new bio on the list */
- bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_bio(bio)->device = stripe->dev;
- bio->bi_iter.bi_size = 0;
- bio_set_dev(bio, stripe->dev->bdev);
+ bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
+ opf, GFP_NOFS);
bio->bi_iter.bi_sector = disk_start >> 9;
+ bio->bi_private = rbio;
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
bio_list_add(bio_list, bio);
return 0;
}
@@ -1130,6 +1207,32 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
}
}
+static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+
+ if (bio_flagged(bio, BIO_CLONED))
+ bio->bi_iter = btrfs_bio(bio)->iter;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ u32 bvec_offset;
+
+ for (bvec_offset = 0; bvec_offset < bvec.bv_len;
+ bvec_offset += sectorsize, offset += sectorsize) {
+ int index = offset / sectorsize;
+ struct sector_ptr *sector = &rbio->bio_sectors[index];
+
+ sector->page = bvec.bv_page;
+ sector->pgoff = bvec.bv_offset + bvec_offset;
+ ASSERT(sector->pgoff < PAGE_SIZE);
+ }
+ }
+}
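
To see the indexing above with concrete numbers (all assumed for the example): if the full stripe starts at raid_map[0] = 1 MiB and a bio begins 24 KiB into it with 4K sectors, the first bvec sector lands in bio_sectors[6], and each sectorsize step advances one slot:

#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	const unsigned int sectorsize = 4096;
	const unsigned long long raid_map0 = 1024 * 1024;	/* assumed */
	const unsigned long long bi_sector =
		(raid_map0 + 24 * 1024) >> SECTOR_SHIFT;
	unsigned int offset = (bi_sector << SECTOR_SHIFT) - raid_map0;

	printf("first index = %u\n", offset / sectorsize);	/* 6 */
	return 0;
}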
+
/*
* helper function to walk our bio list and populate the bio_pages array with
* the result. This seems expensive, but it is faster than constantly
@@ -1141,28 +1244,11 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
- u64 start;
- unsigned long stripe_offset;
- unsigned long page_index;
spin_lock_irq(&rbio->bio_list_lock);
- bio_list_for_each(bio, &rbio->bio_list) {
- struct bio_vec bvec;
- struct bvec_iter iter;
- int i = 0;
+ bio_list_for_each(bio, &rbio->bio_list)
+ index_one_bio(rbio, bio);
- start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bioc->raid_map[0];
- page_index = stripe_offset >> PAGE_SHIFT;
-
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_bio(bio)->iter;
-
- bio_for_each_segment(bvec, bio, iter) {
- rbio->bio_pages[page_index + i] = bvec.bv_page;
- i++;
- }
- }
spin_unlock_irq(&rbio->bio_list_lock);
}
@@ -1177,10 +1263,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
- int pagenr;
+ int sectornr;
bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
@@ -1224,35 +1311,37 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
- /* first collect one page from each data stripe */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
+ /* First collect one sector from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap_local_page(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
- /* then add the parity stripe */
- p = rbio_pstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap_local_page(p);
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
if (has_qstripe) {
-
/*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
*/
- p = rbio_qstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap_local_page(p);
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
for (stripe = stripe - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
@@ -1264,18 +1353,20 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
* everything else.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
continue;
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_page(rbio, &bio_list,
- page, stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, rbio->stripe_len,
+ REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -1288,19 +1379,21 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
if (!bioc->tgtdev_map[stripe])
continue;
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
continue;
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_page(rbio, &bio_list, page,
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
rbio->bioc->tgtdev_map[stripe],
- pagenr, rbio->stripe_len);
+ sectornr, rbio->stripe_len,
+ REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -1311,9 +1404,7 @@ write_data:
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
submit_bio(bio);
}
@@ -1417,18 +1508,48 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
}
/*
+ * For the subpage case we can no longer set a page's Uptodate flag
+ * directly for stripe_pages[], so we need to locate the sector instead.
+ */
+static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
+ struct page *page,
+ unsigned int pgoff)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
+ if (sector->page == page && sector->pgoff == pgoff)
+ return sector;
+ }
+ return NULL;
+}
+
+/*
* this sets each page in the bio uptodate. It should only be used on private
* rbio pages, nothing that comes in from the higher layers
*/
-static void set_bio_pages_uptodate(struct bio *bio)
+static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all)
- SetPageUptodate(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct sector_ptr *sector;
+ int pgoff;
+
+ for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
+ pgoff += sectorsize) {
+ sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
+ ASSERT(sector);
+ if (sector)
+ sector->uptodate = 1;
+ }
+ }
}
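find_stripe_sector() above is a plain linear scan keyed on (page, pgoff), and the rewritten set_bio_pages_uptodate() walks each bio segment in sectorsize steps. A compilable userspace sketch of that lookup and walk, with the struct reduced to the fields used here (names illustrative):

	#include <stdio.h>
	#include <stddef.h>

	struct sector_ptr {
		void *page;
		unsigned int pgoff;
		unsigned int uptodate : 1;
	};

	/* Linear scan over all sectors of the rbio, matching on (page, pgoff). */
	static struct sector_ptr *find_sector(struct sector_ptr *sectors, int nr,
					      void *page, unsigned int pgoff)
	{
		for (int i = 0; i < nr; i++) {
			if (sectors[i].page == page && sectors[i].pgoff == pgoff)
				return &sectors[i];
		}
		return NULL;
	}

	int main(void)
	{
		char page[4096];
		struct sector_ptr sectors[4] = {
			{ page, 0 }, { page, 1024 }, { page, 2048 }, { page, 3072 }
		};
		const unsigned int sectorsize = 1024;

		/* Mark every sector covered by a bvec (offset 1024, length 2048). */
		for (unsigned int pgoff = 1024; pgoff < 1024 + 2048; pgoff += sectorsize) {
			struct sector_ptr *s = find_sector(sectors, 4, page, pgoff);
			if (s)
				s->uptodate = 1;
		}
		printf("%u %u %u %u\n", sectors[0].uptodate, sectors[1].uptodate,
		       sectors[2].uptodate, sectors[3].uptodate);	/* 0 1 1 0 */
		return 0;
	}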
/*
@@ -1446,7 +1567,7 @@ static void raid_rmw_end_io(struct bio *bio)
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
@@ -1478,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
+ int sectornr;
int stripe;
struct bio *bio;
@@ -1496,28 +1617,30 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* stripe
*/
for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
/*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
+ * We want to find all the sectors missing from the
+ * rbio and read them from the disk. If sector_in_rbio()
+ * finds a sector in the bio list we don't need to read
+ * it off the stripe.
*/
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
continue;
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
/*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
+ * The bio cache may have handed us an uptodate page.
+ * If so, be happy and use it.
*/
- if (PageUptodate(page))
+ if (sector->uptodate)
continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, rbio->stripe_len,
+ REQ_OP_READ);
if (ret)
goto cleanup;
}
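The read loop above makes a three-way decision for every sector. Restated as a compilable classifier (names are illustrative, not kernel symbols):

	#include <stdio.h>
	#include <stdbool.h>

	enum sector_source { FROM_BIO_LIST, FROM_CACHE, NEEDS_READ };

	/* Mirrors the skip logic in raid56_rmw_stripe(): sectors already supplied
	 * by the higher-layer bio list or already uptodate from the stripe cache
	 * are not read again. */
	static enum sector_source classify(bool in_bio_list, bool uptodate)
	{
		if (in_bio_list)
			return FROM_BIO_LIST;
		if (uptodate)
			return FROM_CACHE;
		return NEEDS_READ;
	}

	int main(void)
	{
		printf("%d %d %d\n", classify(true, false), classify(false, true),
		       classify(false, false));	/* 0 1 2 */
		return 0;
	}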
@@ -1540,9 +1663,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
- bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
@@ -1624,7 +1745,7 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct btrfs_work work;
+ struct work_struct work;
};
/*
@@ -1692,7 +1813,7 @@ static void run_plug(struct btrfs_plug_cb *plug)
* if the unplug comes from schedule, we have to push the
* work off to a helper thread
*/
-static void unplug_work(struct btrfs_work *work)
+static void unplug_work(struct work_struct *work)
{
struct btrfs_plug_cb *plug;
plug = container_of(work, struct btrfs_plug_cb, work);
@@ -1705,9 +1826,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
- btrfs_queue_work(plug->info->rmw_workers,
- &plug->work);
+ INIT_WORK(&plug->work, unplug_work);
+ queue_work(plug->info->rmw_workers, &plug->work);
return;
}
run_plug(plug);
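The plug callback now uses the stock workqueue API instead of btrfs_work. A minimal kernel-style sketch of that pattern, with illustrative names in place of the btrfs symbols:

	#include <linux/workqueue.h>

	struct plug_cb {
		struct work_struct work;
		/* payload: list of deferred rbios, fs_info, ... */
	};

	static void plug_work_fn(struct work_struct *work)
	{
		/* container_of() recovers the wrapper from the embedded work item */
		struct plug_cb *cb = container_of(work, struct plug_cb, work);

		/* process cb's deferred list here, in process context */
	}

	static void defer_plug(struct workqueue_struct *wq, struct plug_cb *cb)
	{
		INIT_WORK(&cb->work, plug_work_fn);
		queue_work(wq, &cb->work);	/* rmw_workers is now a plain workqueue */
	}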
@@ -1716,8 +1836,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
@@ -1772,14 +1891,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
*/
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
- int pagenr, stripe;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int sectornr, stripe;
void **pointers;
void **unmap_array;
int faila = -1, failb = -1;
- struct page *page;
blk_status_t err;
int i;
+ /*
+ * This array stores a pointer for each sector; each pointer already has
+ * the sector's pgoff added, so it addresses the sector data, not the
+ * page start.
+ */
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
err = BLK_STS_RESOURCE;
@@ -1808,43 +1931,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
/*
* Now we just use bitmap to mark the horizontal stripes in
* which we have data when doing parity scrub.
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(pagenr, rbio->dbitmap))
+ !test_bit(sectornr, rbio->dbitmap))
continue;
/*
- * Setup our array of pointers with pages from each stripe
+ * Setup our array of pointers with sectors from each stripe
*
* NOTE: store a duplicate array of pointers to preserve the
* pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
- * if we're rebuilding a read, we have to use
+ * If we're rebuilding a read, we have to use
* pages from the bio list
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
- page = page_in_rbio(rbio, stripe, pagenr, 0);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- pointers[stripe] = kmap_local_page(page);
+ ASSERT(sector->page);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
unmap_array[stripe] = pointers[stripe];
}
- /* all raid6 handling here */
+ /* All raid6 handling here */
if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
- /*
- * single failure, rebuild from parity raid5
- * style
- */
+ /* Single failure, rebuild from parity raid5 style */
if (failb < 0) {
if (faila == rbio->nr_data) {
/*
@@ -1887,10 +2011,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
- PAGE_SIZE, faila, pointers);
+ sectorsize, faila, pointers);
} else {
raid6_2data_recov(rbio->real_stripes,
- PAGE_SIZE, faila, failb,
+ sectorsize, faila, failb,
pointers);
}
} else {
@@ -1900,7 +2024,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
BUG_ON(failb != -1);
pstripe:
/* Copy parity block into failed block to start with */
- copy_page(pointers[faila], pointers[rbio->nr_data]);
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
/* rearrange the pointer array */
p = pointers[faila];
@@ -1909,7 +2033,7 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
}
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
@@ -1918,14 +2042,14 @@ pstripe:
* other endio functions will fiddle the uptodate bits
*/
if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < rbio->stripe_npages; i++) {
+ for (i = 0; i < rbio->stripe_nsectors; i++) {
if (faila != -1) {
- page = rbio_stripe_page(rbio, faila, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, faila, i);
+ sector->uptodate = 1;
}
if (failb != -1) {
- page = rbio_stripe_page(rbio, failb, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, failb, i);
+ sector->uptodate = 1;
}
}
}
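The pstripe path recovers a single failed data sector by seeding it with the parity copy, rotating the failed slot to the end of the pointer array, and XOR-ing the surviving data back in, all per sectorsize unit now. A self-contained userspace sketch of that arithmetic, assuming three data stripes (illustrative only):

	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	#define SECT 4096
	#define NR_DATA 3

	/* dest is ptrs[src_cnt]; XOR all sources into it, like the kernel's run_xor() */
	static void run_xor(uint8_t **ptrs, int src_cnt, size_t len)
	{
		for (int s = 0; s < src_cnt; s++)
			for (size_t i = 0; i < len; i++)
				ptrs[src_cnt][i] ^= ptrs[s][i];
	}

	int main(void)
	{
		static uint8_t d[NR_DATA][SECT], parity[SECT], saved[SECT];
		uint8_t *pointers[NR_DATA + 1];
		int faila = 1;				/* pretend stripe 1 failed */

		memset(d[0], 0x11, SECT);
		memset(d[1], 0x22, SECT);
		memset(d[2], 0x44, SECT);
		for (int i = 0; i < SECT; i++)
			parity[i] = d[0][i] ^ d[1][i] ^ d[2][i];

		memcpy(saved, d[faila], SECT);
		memset(d[faila], 0, SECT);		/* lost contents */

		for (int i = 0; i < NR_DATA; i++)
			pointers[i] = d[i];
		pointers[NR_DATA] = parity;

		/* Copy parity block into failed block to start with */
		memcpy(pointers[faila], pointers[NR_DATA], SECT);
		/* rearrange: move the failed slot to the end of the data pointers */
		uint8_t *p = pointers[faila];
		for (int i = faila; i < NR_DATA - 1; i++)
			pointers[i] = pointers[i + 1];
		pointers[NR_DATA - 1] = p;
		/* xor in the rest: dest is pointers[NR_DATA - 1] */
		run_xor(pointers, NR_DATA - 1, SECT);

		assert(memcmp(p, saved, SECT) == 0);	/* stripe 1 recovered */
		return 0;
	}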
@@ -1998,7 +2122,7 @@ static void raid_recover_end_io(struct bio *bio)
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
if (!atomic_dec_and_test(&rbio->stripes_pending))
@@ -2023,7 +2147,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
+ int sectornr;
int stripe;
struct bio *bio;
@@ -2046,20 +2170,20 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
continue;
}
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
/*
* the rmw code may have already read this
* page in
*/
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ if (sector->uptodate)
continue;
- ret = rbio_add_io_page(rbio, &bio_list,
- rbio_stripe_page(rbio, stripe, pagenr),
- stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, rbio->stripe_len,
+ REQ_OP_READ);
if (ret < 0)
goto cleanup;
}
@@ -2086,9 +2210,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
- bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
@@ -2115,7 +2237,7 @@ cleanup:
* of the drive.
*/
int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len, int mirror_num, int generic_io)
+ u32 stripe_len, int mirror_num, int generic_io)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
@@ -2193,7 +2315,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
}
-static void rmw_work(struct btrfs_work *work)
+static void rmw_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2201,7 +2323,7 @@ static void rmw_work(struct btrfs_work *work)
raid56_rmw_stripe(rbio);
}
-static void read_rebuild_work(struct btrfs_work *work)
+static void read_rebuild_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2221,7 +2343,7 @@ static void read_rebuild_work(struct btrfs_work *work)
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ u32 stripe_len, struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
@@ -2252,9 +2374,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
}
ASSERT(i < rbio->real_stripes);
- /* Now we just support the sectorsize equals to page size */
- ASSERT(fs_info->sectorsize == PAGE_SIZE);
- ASSERT(rbio->stripe_npages == stripe_nsectors);
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
@@ -2268,17 +2387,19 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical)
+ unsigned int pgoff, u64 logical)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
int stripe_offset;
int index;
ASSERT(logical >= rbio->bioc->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
+ ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
- index = stripe_offset >> PAGE_SHIFT;
- rbio->bio_pages[index] = page;
+ index = stripe_offset / sectorsize;
+ rbio->bio_sectors[index].page = page;
+ rbio->bio_sectors[index].pgoff = pgoff;
}
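The key change in raid56_add_scrub_pages() is deriving the index by dividing by sectorsize instead of shifting by PAGE_SHIFT, which would collapse distinct sectors on systems where pages are larger than sectors. A quick illustrative calculation under assumed geometry:

	#include <stdio.h>

	int main(void)
	{
		const unsigned int sectorsize = 4096;		/* assumption */
		const unsigned int page_shift = 16;		/* 64KiB pages, e.g. arm64/ppc64 */
		unsigned long long stripe_offset = 5 * 4096;	/* 20KiB into the stripe */

		printf("sector index:   %llu\n", stripe_offset / sectorsize);  /* 5 */
		printf("old page index: %llu\n", stripe_offset >> page_shift); /* 0: loses the subpage position */
		return 0;
	}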
/*
@@ -2287,14 +2408,16 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
*/
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- int bit;
- int index;
- struct page *page;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int stripe;
+ int sectornr;
+
+ for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ struct page *page;
+ int index = (stripe * rbio->stripe_nsectors + sectornr) *
+ sectorsize >> PAGE_SHIFT;
- for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
- for (i = 0; i < rbio->real_stripes; i++) {
- index = i * rbio->stripe_npages + bit;
if (rbio->stripe_pages[index])
continue;
@@ -2304,6 +2427,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
rbio->stripe_pages[index] = page;
}
}
+ index_stripe_sectors(rbio);
return 0;
}
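The index computed in alloc_rbio_essential_pages() folds (stripe, sectornr) into a byte offset before shifting, so on 64KiB pages a run of consecutive sectors shares one stripe_pages[] slot. A small check of that arithmetic, with the geometry assumed for illustration:

	#include <stdio.h>

	int main(void)
	{
		const unsigned int sectorsize = 4096;		/* assumption */
		const unsigned int page_shift = 16;		/* 64KiB pages */
		const int stripe_nsectors = 16;			/* 64KiB stripe / 4KiB */

		for (int stripe = 0; stripe < 2; stripe++) {
			for (int sectornr = 0; sectornr < stripe_nsectors; sectornr += 8) {
				int index = (stripe * stripe_nsectors + sectornr) *
					    sectorsize >> page_shift;
				printf("stripe %d sector %2d -> page %d\n",
				       stripe, sectornr, index);
			}
		}
		return 0;	/* all 16 sectors of stripe 0 share page 0, and so on */
	}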
@@ -2311,14 +2435,15 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
- int pagenr;
+ int sectornr;
bool has_qstripe;
- struct page *p_page = NULL;
- struct page *q_page = NULL;
+ struct sector_ptr p_sector = { 0 };
+ struct sector_ptr q_sector = { 0 };
struct bio_list bio_list;
struct bio *bio;
int is_replace = 0;
@@ -2335,7 +2460,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
- bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+ bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
}
/*
@@ -2348,54 +2473,59 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!need_check)
goto writeback;
- p_page = alloc_page(GFP_NOFS);
- if (!p_page)
+ p_sector.page = alloc_page(GFP_NOFS);
+ if (!p_sector.page)
goto cleanup;
- SetPageUptodate(p_page);
+ p_sector.pgoff = 0;
+ p_sector.uptodate = 1;
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_page = alloc_page(GFP_NOFS);
- if (!q_page) {
- __free_page(p_page);
+ q_sector.page = alloc_page(GFP_NOFS);
+ if (!q_sector.page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
goto cleanup;
}
- SetPageUptodate(q_page);
- pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
+ q_sector.pgoff = 0;
+ q_sector.uptodate = 1;
+ pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
}
atomic_set(&rbio->error, 0);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap_local_page(p_page);
+ pointers[nr_data] = kmap_local_page(p_sector.page);
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *p;
+ for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
void *parity;
+
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap_local_page(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
if (has_qstripe) {
/* RAID6, call the library function to fill in our P/Q */
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
/* Check scrubbing parity and repair it */
- p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- parity = kmap_local_page(p);
- if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
- copy_page(parity, pointers[rbio->scrubp]);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ parity = kmap_local_page(sector->page) + sector->pgoff;
+ if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
+ memcpy(parity, pointers[rbio->scrubp], sectorsize);
else
/* Parity is right, needn't writeback */
- bitmap_clear(rbio->dbitmap, pagenr, 1);
+ bitmap_clear(rbio->dbitmap, sectornr, 1);
kunmap_local(parity);
for (stripe = nr_data - 1; stripe >= 0; stripe--)
@@ -2403,10 +2533,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
}
kunmap_local(pointers[nr_data]);
- __free_page(p_page);
- if (q_page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
+ if (q_sector.page) {
kunmap_local(pointers[rbio->real_stripes - 1]);
- __free_page(q_page);
+ __free_page(q_sector.page);
+ q_sector.page = NULL;
}
writeback:
@@ -2415,12 +2547,12 @@ writeback:
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list,
- page, rbio->scrubp, pagenr, rbio->stripe_len);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
+ sectornr, rbio->stripe_len, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
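The scrub loop compares freshly computed parity against the on-disk copy one sector at a time, rewrites only mismatches, and clears matching sectors from the writeback bitmap. A userspace sketch of that verify-and-repair step, with the bitmap reduced to a plain unsigned long (values assumed):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const unsigned int sectorsize = 16;	/* tiny, for illustration */
		char on_disk[2][16], computed[2][16];
		unsigned long dbitmap = 0x3;		/* both sectors need checking */

		memset(on_disk[0], 'A', sectorsize);
		memset(computed[0], 'A', sectorsize);	/* sector 0: parity is right */
		memset(on_disk[1], 'B', sectorsize);
		memset(computed[1], 'C', sectorsize);	/* sector 1: stale parity */

		for (int nr = 0; nr < 2; nr++) {
			if (!(dbitmap & (1UL << nr)))
				continue;
			if (memcmp(on_disk[nr], computed[nr], sectorsize) != 0)
				memcpy(on_disk[nr], computed[nr], sectorsize); /* repair */
			else
				dbitmap &= ~(1UL << nr);  /* correct: skip writeback */
		}
		printf("sectors left to write back: %#lx\n", dbitmap);	/* 0x2 */
		return 0;
	}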
@@ -2428,13 +2560,13 @@ writeback:
if (!is_replace)
goto submit_write;
- for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list, page,
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
bioc->tgtdev_map[rbio->scrubp],
- pagenr, rbio->stripe_len);
+ sectornr, rbio->stripe_len, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2450,9 +2582,7 @@ submit_write:
atomic_set(&rbio->stripes_pending, nr_data);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
submit_bio(bio);
}
@@ -2548,7 +2678,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio)
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
@@ -2568,7 +2698,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
+ int sectornr;
int stripe;
struct bio *bio;
@@ -2584,28 +2714,29 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* stripe
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
/*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
+ * We want to find all the sectors missing from the
+ * rbio and read them from the disk. If sector_in_rbio()
+ * finds a sector in the bio list we don't need to read
+ * it off the stripe.
*/
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
continue;
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
/*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
+ * The bio cache may have handed us an uptodate sector.
+ * If so, be happy and use it.
*/
- if (PageUptodate(page))
+ if (sector->uptodate)
continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, rbio->stripe_len,
+ REQ_OP_READ);
if (ret)
goto cleanup;
}
@@ -2628,9 +2759,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
- bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
@@ -2651,7 +2780,7 @@ finish:
validate_rbio_for_parity_scrub(rbio);
}
-static void scrub_parity_work(struct btrfs_work *work)
+static void scrub_parity_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 72c00fc284b5..aaad08aefd7d 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -31,15 +31,14 @@ struct btrfs_raid_bio;
struct btrfs_device;
int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len, int mirror_num, int generic_io);
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len);
+ u32 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical);
+ unsigned int pgoff, u64 logical);
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
- struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_io_context *bioc, u32 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 998e3f180d90..c39f8b3a5a4a 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -614,14 +614,23 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
+ u64 range1_end = loff1 + len - 1;
+ u64 range2_end = loff2 + len - 1;
+
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
+ swap(range1_end, range2_end);
} else if (inode1 == inode2 && loff2 < loff1) {
swap(loff1, loff2);
+ swap(range1_end, range2_end);
}
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end);
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
+ btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}
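For the reflink hunk above: the swaps keep a single global lock order (lower inode pointer first, then lower start offset within the same inode), and each range end must travel with its start. A reduced sketch of the ordering rule, with simplified types and illustrative names:

	#include <stdio.h>
	#include <stdint.h>

	struct locked_range {
		const void *inode;	/* stands in for struct inode * */
		uint64_t start;
		uint64_t end;
	};

	/* Ensure *a sorts before *b so every lock site takes locks in one order. */
	static void order_for_locking(struct locked_range *a, struct locked_range *b)
	{
		if ((uintptr_t)b->inode < (uintptr_t)a->inode ||
		    (a->inode == b->inode && b->start < a->start)) {
			struct locked_range tmp = *a;
			*a = *b;
			*b = tmp;	/* start and end swap together, as in the patch */
		}
	}

	int main(void)
	{
		static int inodes[2];	/* addresses stand in for struct inode * */
		struct locked_range a = { &inodes[1], 4096, 8191 };
		struct locked_range b = { &inodes[0], 0, 4095 };

		order_for_locking(&a, &b);
		printf("lock inode %p range [%llu, %llu] first\n", a.inode,
		       (unsigned long long)a.start, (unsigned long long)a.end);
		return 0;
	}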
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
@@ -771,7 +780,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
- bool same_inode = inode_out == inode_in;
u64 wb_len;
int ret;
@@ -810,15 +818,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
wb_len = ALIGN(*len, bs);
/*
- * Since we don't lock ranges, wait for ongoing lockless dio writes (as
- * any in progress could create its ordered extents after we wait for
- * existing ordered extents below).
- */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- /*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
*
* Btrfs' back references do not have a block level granularity, they
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fdc2c4b411f0..a6dc827e75af 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
- root = (struct btrfs_root *)node->data;
+ root = node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
return btrfs_grab_root(root);
@@ -1101,7 +1101,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
continue;
/*
- * if we are modifying block in fs tree, wait for readpage
+ * if we are modifying block in fs tree, wait for read_folio
* to complete and drop the extent cache
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
@@ -1563,7 +1563,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
end = (u64)-1;
}
- /* the lock_extent waits for readpage to complete */
+ /* the lock_extent waits for read_folio to complete */
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
@@ -2818,7 +2818,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* Subpage can't handle page with DIRTY but without UPTODATE
* bit as it can lead to the following deadlock:
*
- * btrfs_readpage()
+ * btrfs_read_folio()
* | Page already *locked*
* |- btrfs_lock_and_flush_ordered_range()
* |- btrfs_start_ordered_extent()
@@ -2967,11 +2967,12 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
goto release_page;
if (PageReadahead(page))
- page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
- page_index, last_index + 1 - page_index);
+ page_cache_async_readahead(inode->i_mapping, ra, NULL,
+ page_folio(page), page_index,
+ last_index + 1 - page_index);
if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (!PageUptodate(page)) {
ret = -EIO;
@@ -2997,7 +2998,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- clamped_len, clamped_len);
+ clamped_len, clamped_len,
+ false);
if (ret)
goto release_page;
@@ -3845,8 +3847,7 @@ out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
- if (inode)
- iput(inode);
+ iput(inode);
inode = ERR_PTR(err);
}
return inode;
@@ -3977,6 +3978,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (!bg)
return -ENOENT;
+ /*
+ * Relocation of a data block group creates ordered extents. Without
+ * sb_start_write(), we can freeze the filesystem while unfinished
+ * ordered extents are left. Such ordered extents can cause a deadlock
+ * e.g. when syncfs() is waiting for their completion but they can't
+ * finish because they block when joining a transaction, due to the
+ * fact that the freeze locks are being held in write mode.
+ */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ ASSERT(sb_write_started(fs_info->sb));
+
if (btrfs_pinned_by_swapfile(fs_info, bg)) {
btrfs_put_block_group(bg);
return -ETXTBSY;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ca7426ef61c8..a64b26b16904 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
- qgroup_num_bytes, true);
+ qgroup_num_bytes, true,
+ false);
if (ret)
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 8cd713d37ad2..e7b0323e6efd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -45,14 +45,14 @@ struct scrub_ctx;
* operations. The first one configures an upper limit for the number
* of (dynamically allocated) pages that are added to a bio.
*/
-#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */
+#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB sectorsize */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB sectorsize */
/*
* The following value times PAGE_SIZE needs to be large enough to match the
* largest node/leaf/sector size that shall be supported.
*/
-#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
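Assuming BTRFS_MAX_METADATA_BLOCKSIZE is still 64KiB, the renamed limit expands as follows (illustrative, not the kernel headers):

	#define BTRFS_MAX_METADATA_BLOCKSIZE	(64 * 1024)	/* 64KiB (assumption) */
	#define SZ_4K				4096
	#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
	/* == 16 sectors: enough for a 64KiB nodesize built from 4KiB sectors */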
struct scrub_recover {
refcount_t refs;
@@ -60,7 +60,7 @@ struct scrub_recover {
u64 map_length;
};
-struct scrub_page {
+struct scrub_sector {
struct scrub_block *sblock;
struct page *page;
struct btrfs_device *dev;
@@ -87,16 +87,16 @@ struct scrub_bio {
blk_status_t status;
u64 logical;
u64 physical;
- struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
- int page_count;
+ struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
+ int sector_count;
int next_free;
- struct btrfs_work work;
+ struct work_struct work;
};
struct scrub_block {
- struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
- int page_count;
- atomic_t outstanding_pages;
+ struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ int sector_count;
+ atomic_t outstanding_sectors;
refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
@@ -110,7 +110,7 @@ struct scrub_block {
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
- struct btrfs_work work;
+ struct work_struct work;
};
/* Used for the chunks with parity stripe such RAID5/6 */
@@ -129,10 +129,10 @@ struct scrub_parity {
refcount_t refs;
- struct list_head spages;
+ struct list_head sectors_list;
/* Work of parity check and repair */
- struct btrfs_work work;
+ struct work_struct work;
/* Mark the parity blocks which have data */
unsigned long *dbitmap;
@@ -158,7 +158,7 @@ struct scrub_ctx {
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
- int pages_per_bio;
+ int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -212,43 +212,43 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
- int page_num, int force_write);
+ int sector_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num);
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
+ int sector_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
-static void scrub_page_get(struct scrub_page *spage);
-static void scrub_page_put(struct scrub_page *spage);
+static void scrub_sector_get(struct scrub_sector *sector);
+static void scrub_sector_put(struct scrub_sector *sector);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum,
- u64 physical_for_dev_replace);
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum,
+ u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
-static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num);
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage);
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);
-static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
+static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
{
- return spage->recover &&
- (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ return sector->recover &&
+ (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -535,9 +535,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
- for (i = 0; i < sbio->page_count; i++) {
- WARN_ON(!sbio->pagev[i]->page);
- scrub_block_put(sbio->pagev[i]->sblock);
+ for (i = 0; i < sbio->sector_count; i++) {
+ WARN_ON(!sbio->sectors[i]->page);
+ scrub_block_put(sbio->sectors[i]->sblock);
}
bio_put(sbio->bio);
}
@@ -572,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
+ sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
@@ -586,9 +586,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sbio->index = i;
sbio->sctx = sctx;
- sbio->page_count = 0;
- btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
- NULL);
+ sbio->sector_count = 0;
+ INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -728,16 +727,16 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u8 ref_level = 0;
int ret;
- WARN_ON(sblock->page_count < 1);
- dev = sblock->pagev[0]->dev;
+ WARN_ON(sblock->sector_count < 1);
+ dev = sblock->sectors[0]->dev;
fs_info = sblock->sctx->fs_info;
path = btrfs_alloc_path();
if (!path)
return;
- swarn.physical = sblock->pagev[0]->physical;
- swarn.logical = sblock->pagev[0]->logical;
+ swarn.physical = sblock->sectors[0]->physical;
+ swarn.logical = sblock->sectors[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -798,8 +797,8 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
/*
* scrub_handle_errored_block gets called when either verification of the
- * pages failed or the bio failed to read, e.g. with EIO. In the latter
- * case, this function handles all pages in the bio, even though only one
+ * sectors failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all sectors in the bio, even though only one
* may be bad.
* The goal of this function is to repair the errored block by using the
* contents of one of the mirrors.
@@ -817,16 +816,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
- int page_num;
+ int sector_num;
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- BUG_ON(sblock_to_check->page_count < 1);
+ BUG_ON(sblock_to_check->sector_count < 1);
fs_info = sctx->fs_info;
- if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
@@ -837,13 +836,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_unlock(&sctx->stat_lock);
return 0;
}
- logical = sblock_to_check->pagev[0]->logical;
- BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
- failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
- is_metadata = !(sblock_to_check->pagev[0]->flags &
+ logical = sblock_to_check->sectors[0]->logical;
+ BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
+ failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
+ is_metadata = !(sblock_to_check->sectors[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->pagev[0]->have_csum;
- dev = sblock_to_check->pagev[0]->dev;
+ have_csum = sblock_to_check->sectors[0]->have_csum;
+ dev = sblock_to_check->sectors[0]->dev;
if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
return 0;
@@ -854,7 +853,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* might be waiting the scrub task to pause (which needs to wait for all
* the worker tasks to complete before pausing).
* We do allocations in the workers through insert_full_stripe_lock()
- * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+ * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
* this function.
*/
nofs_flag = memalloc_nofs_save();
@@ -918,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
goto out;
}
- /* setup the context, map the logical blocks and alloc the pages */
+ /* Setup the context, map the logical blocks and alloc the sectors */
ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
@@ -937,7 +936,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
/*
- * the error disappeared after reading page by page, or
+ * The error disappeared after reading sector by sector, or
* the area was part of a huge bio and other parts of the
* bio caused I/O errors, or the block layer merged several
* read requests into one and the error is caused by a
@@ -998,10 +997,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* that is known to contain an error is rewritten. Afterwards
* the block is known to be corrected.
* If a mirror is found which is completely correct, and no
- * checksum is present, only those pages are rewritten that had
+ * checksum is present, only those sectors are rewritten that had
* an I/O error in the block to be repaired, since it cannot be
- * determined, which copy of the other pages is better (and it
- * could happen otherwise that a correct page would be
+ * determined, which copy of the other sectors is better (and it
+ * could happen otherwise that a correct sector would be
* overwritten by a bad one).
*/
for (mirror_index = 0; ;mirror_index++) {
@@ -1011,25 +1010,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
continue;
/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
- if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
- if (!sblocks_for_recheck[mirror_index].page_count)
+ if (!sblocks_for_recheck[mirror_index].sector_count)
break;
sblock_other = sblocks_for_recheck + mirror_index;
} else {
- struct scrub_recover *r = sblock_bad->pagev[0]->recover;
+ struct scrub_recover *r = sblock_bad->sectors[0]->recover;
int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
- if (!sblocks_for_recheck[1].page_count)
+ if (!sblocks_for_recheck[1].sector_count)
break;
ASSERT(failed_mirror_index == 0);
sblock_other = sblocks_for_recheck + 1;
- sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
+ sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
@@ -1078,16 +1077,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* area are unreadable.
*/
success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count;
- page_num++) {
- struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
+ for (sector_num = 0; sector_num < sblock_bad->sector_count;
+ sector_num++) {
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
struct scrub_block *sblock_other = NULL;
- /* skip no-io-error page in scrub */
- if (!spage_bad->io_error && !sctx->is_dev_replace)
+ /* Skip no-io-error sectors in scrub */
+ if (!sector_bad->io_error && !sctx->is_dev_replace)
continue;
- if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
/*
* In case of dev replace, if raid56 rebuild process
* didn't work out correct data, then copy the content
@@ -1096,14 +1095,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
- } else if (spage_bad->io_error) {
- /* try to find no-io-error page in mirrors */
+ } else if (sector_bad->io_error) {
+ /* Try to find no-io-error sector in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
+ sblocks_for_recheck[mirror_index].sector_count > 0;
mirror_index++) {
if (!sblocks_for_recheck[mirror_index].
- pagev[page_num]->io_error) {
+ sectors[sector_num]->io_error) {
sblock_other = sblocks_for_recheck +
mirror_index;
break;
@@ -1115,27 +1114,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (sctx->is_dev_replace) {
/*
- * did not find a mirror to fetch the page
- * from. scrub_write_page_to_dev_replace()
- * handles this case (page->io_error), by
- * filling the block with zeros before
- * submitting the write request
+ * Did not find a mirror to fetch the sector from.
+ * scrub_write_sector_to_dev_replace() handles this
+ * case (sector->io_error) by filling the block with
+ * zeros before submitting the write request.
*/
if (!sblock_other)
sblock_other = sblock_bad;
- if (scrub_write_page_to_dev_replace(sblock_other,
- page_num) != 0) {
+ if (scrub_write_sector_to_dev_replace(sblock_other,
+ sector_num) != 0) {
atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
} else if (sblock_other) {
- ret = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_other,
- page_num, 0);
+ ret = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_other,
+ sector_num, 0);
if (0 == ret)
- spage_bad->io_error = 0;
+ sector_bad->io_error = 0;
else
success = 0;
}
@@ -1186,18 +1184,16 @@ out:
struct scrub_block *sblock = sblocks_for_recheck +
mirror_index;
struct scrub_recover *recover;
- int page_index;
+ int i;
- for (page_index = 0; page_index < sblock->page_count;
- page_index++) {
- sblock->pagev[page_index]->sblock = NULL;
- recover = sblock->pagev[page_index]->recover;
+ for (i = 0; i < sblock->sector_count; i++) {
+ sblock->sectors[i]->sblock = NULL;
+ recover = sblock->sectors[i]->recover;
if (recover) {
scrub_put_recover(fs_info, recover);
- sblock->pagev[page_index]->recover =
- NULL;
+ sblock->sectors[i]->recover = NULL;
}
- scrub_page_put(sblock->pagev[page_index]);
+ scrub_sector_put(sblock->sectors[i]);
}
}
kfree(sblocks_for_recheck);
@@ -1255,26 +1251,25 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = original_sblock->page_count * fs_info->sectorsize;
- u64 logical = original_sblock->pagev[0]->logical;
- u64 generation = original_sblock->pagev[0]->generation;
- u64 flags = original_sblock->pagev[0]->flags;
- u64 have_csum = original_sblock->pagev[0]->have_csum;
+ u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
+ u64 logical = original_sblock->sectors[0]->logical;
+ u64 generation = original_sblock->sectors[0]->generation;
+ u64 flags = original_sblock->sectors[0]->flags;
+ u64 have_csum = original_sblock->sectors[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
- int page_index = 0;
+ int sector_index = 0;
int mirror_index;
int nmirrors;
int ret;
/*
- * note: the two members refs and outstanding_pages
- * are not used (and not set) in the blocks that are used for
- * the recheck procedure
+ * Note: the two members refs and outstanding_sectors are not used (and
+ * not set) in the blocks that are used for the recheck procedure.
*/
while (length > 0) {
@@ -1306,20 +1301,20 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
recover->bioc = bioc;
recover->map_length = mapped_length;
- ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
- struct scrub_page *spage;
+ struct scrub_sector *sector;
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
- spage = kzalloc(sizeof(*spage), GFP_NOFS);
- if (!spage) {
+ sector = kzalloc(sizeof(*sector), GFP_NOFS);
+ if (!sector) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -1327,16 +1322,16 @@ leave_nomem:
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_page_get(spage);
- sblock->pagev[page_index] = spage;
- spage->sblock = sblock;
- spage->flags = flags;
- spage->generation = generation;
- spage->logical = logical;
- spage->have_csum = have_csum;
+ scrub_sector_get(sector);
+ sblock->sectors[sector_index] = sector;
+ sector->sblock = sblock;
+ sector->flags = flags;
+ sector->generation = generation;
+ sector->logical = logical;
+ sector->have_csum = have_csum;
if (have_csum)
- memcpy(spage->csum,
- original_sblock->pagev[0]->csum,
+ memcpy(sector->csum,
+ original_sblock->sectors[0]->csum,
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
@@ -1348,28 +1343,28 @@ leave_nomem:
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bioc->stripes[stripe_index].physical +
+ sector->physical = bioc->stripes[stripe_index].physical +
stripe_offset;
- spage->dev = bioc->stripes[stripe_index].dev;
+ sector->dev = bioc->stripes[stripe_index].dev;
- BUG_ON(page_index >= original_sblock->page_count);
- spage->physical_for_dev_replace =
- original_sblock->pagev[page_index]->
+ BUG_ON(sector_index >= original_sblock->sector_count);
+ sector->physical_for_dev_replace =
+ original_sblock->sectors[sector_index]->
physical_for_dev_replace;
- /* for missing devices, dev->bdev is NULL */
- spage->mirror_num = mirror_index + 1;
- sblock->page_count++;
- spage->page = alloc_page(GFP_NOFS);
- if (!spage->page)
+ /* For missing devices, dev->bdev is NULL */
+ sector->mirror_num = mirror_index + 1;
+ sblock->sector_count++;
+ sector->page = alloc_page(GFP_NOFS);
+ if (!sector->page)
goto leave_nomem;
scrub_get_recover(recover);
- spage->recover = recover;
+ sector->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
- page_index++;
+ sector_index++;
}
return 0;
@@ -1382,19 +1377,19 @@ static void scrub_bio_wait_endio(struct bio *bio)
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
- struct scrub_page *spage)
+ struct scrub_sector *sector)
{
DECLARE_COMPLETION_ONSTACK(done);
int ret;
int mirror_num;
- bio->bi_iter.bi_sector = spage->logical >> 9;
+ bio->bi_iter.bi_sector = sector->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
- mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(bio, spage->recover->bioc,
- spage->recover->map_length,
+ mirror_num = sector->sblock->sectors[0]->mirror_num;
+ ret = raid56_parity_recover(bio, sector->recover->bioc,
+ sector->recover->map_length,
mirror_num, 0);
if (ret)
return ret;
@@ -1406,26 +1401,25 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock)
{
- struct scrub_page *first_page = sblock->pagev[0];
+ struct scrub_sector *first_sector = sblock->sectors[0];
struct bio *bio;
- int page_num;
+ int i;
- /* All pages in sblock belong to the same stripe on the same device. */
- ASSERT(first_page->dev);
- if (!first_page->dev->bdev)
+ /* All sectors in sblock belong to the same stripe on the same device. */
+ ASSERT(first_sector->dev);
+ if (!first_sector->dev->bdev)
goto out;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
- bio_set_dev(bio, first_page->dev->bdev);
+ bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct scrub_page *spage = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- WARN_ON(!spage->page);
- bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ WARN_ON(!sector->page);
+ bio_add_page(bio, sector->page, PAGE_SIZE, 0);
}
- if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
bio_put(bio);
goto out;
}
@@ -1436,65 +1430,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
return;
out:
- for (page_num = 0; page_num < sblock->page_count; page_num++)
- sblock->pagev[page_num]->io_error = 1;
+ for (i = 0; i < sblock->sector_count; i++)
+ sblock->sectors[i]->io_error = 1;
sblock->no_io_error_seen = 0;
}
/*
- * this function will check the on disk data for checksum errors, header
- * errors and read I/O errors. If any I/O errors happen, the exact pages
- * which are errored are marked as being bad. The goal is to enable scrub
- * to take those pages that are not errored from all the mirrors so that
- * the pages that are errored in the just handled mirror can be repaired.
+ * This function will check the on disk data for checksum errors, header errors
+ * and read I/O errors. If any I/O errors happen, the exact sectors which are
+ * errored are marked as being bad. The goal is to enable scrub to take those
+ * sectors that are not errored from all the mirrors so that the sectors that
+ * are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror)
{
- int page_num;
+ int i;
sblock->no_io_error_seen = 1;
/* short cut for raid56 */
- if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
return scrub_recheck_block_on_raid56(fs_info, sblock);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct bio *bio;
- struct scrub_page *spage = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
+ struct bio bio;
+ struct bio_vec bvec;
- if (spage->dev->bdev == NULL) {
- spage->io_error = 1;
+ if (sector->dev->bdev == NULL) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!spage->page);
- bio = btrfs_bio_alloc(1);
- bio_set_dev(bio, spage->dev->bdev);
-
- bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
- bio->bi_iter.bi_sector = spage->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
+ WARN_ON(!sector->page);
+ bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
+ bio.bi_iter.bi_sector = sector->physical >> 9;
- if (btrfsic_submit_bio_wait(bio)) {
- spage->io_error = 1;
+ btrfsic_check_bio(&bio);
+ if (submit_bio_wait(&bio)) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
}
- bio_put(bio);
+ bio_uninit(&bio);
}
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
}
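The recheck path now builds a fully on-stack bio with a single on-stack bio_vec. A kernel-style sketch of that synchronous-read pattern; the helper name and parameters are illustrative:

	#include <linux/bio.h>

	/* Read one sector synchronously with a fully on-stack bio. */
	static int read_one_sector(struct block_device *bdev, struct page *page,
				   u32 sectorsize, u64 physical)
	{
		struct bio_vec bvec;
		struct bio bio;
		int ret;

		bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
		bio.bi_iter.bi_sector = physical >> 9;	/* block layer uses 512B units */
		__bio_add_page(&bio, page, sectorsize, 0);

		ret = submit_bio_wait(&bio);	/* sleeps until completion */
		bio_uninit(&bio);		/* pairs with bio_init(); no bio_put() */
		return ret;
	}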
-static inline int scrub_check_fsid(u8 fsid[],
- struct scrub_page *spage)
+static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
- struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
int ret;
ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1507,7 +1499,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
sblock->checksum_error = 0;
sblock->generation_error = 0;
- if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
scrub_checksum_data(sblock);
else
scrub_checksum_tree_block(sblock);
@@ -1516,15 +1508,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good)
{
- int page_num;
+ int i;
int ret = 0;
- for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ for (i = 0; i < sblock_bad->sector_count; i++) {
int ret_sub;
- ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_good,
- page_num, 1);
+ ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_good, i, 1);
if (ret_sub)
ret = ret_sub;
}
@@ -1532,47 +1523,43 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
return ret;
}
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int page_num, int force_write)
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int sector_num, int force_write)
{
- struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
- struct scrub_page *spage_good = sblock_good->pagev[page_num];
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
+ struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- BUG_ON(spage_bad->page == NULL);
- BUG_ON(spage_good->page == NULL);
+ BUG_ON(sector_bad->page == NULL);
+ BUG_ON(sector_good->page == NULL);
if (force_write || sblock_bad->header_error ||
- sblock_bad->checksum_error || spage_bad->io_error) {
- struct bio *bio;
+ sblock_bad->checksum_error || sector_bad->io_error) {
+ struct bio bio;
+ struct bio_vec bvec;
int ret;
- if (!spage_bad->dev->bdev) {
+ if (!sector_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
- bio = btrfs_bio_alloc(1);
- bio_set_dev(bio, spage_bad->dev->bdev);
- bio->bi_iter.bi_sector = spage_bad->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+ bio.bi_iter.bi_sector = sector_bad->physical >> 9;
+ __bio_add_page(&bio, sector_good->page, sectorsize, 0);
- ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
- if (ret != sectorsize) {
- bio_put(bio);
- return -EIO;
- }
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
- if (btrfsic_submit_bio_wait(bio)) {
- btrfs_dev_stat_inc_and_print(spage_bad->dev,
+ if (ret) {
+ btrfs_dev_stat_inc_and_print(sector_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
- bio_put(bio);
return -EIO;
}
- bio_put(bio);
}
return 0;
@@ -1581,7 +1568,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
- int page_num;
+ int i;
/*
* This block is used for the check of the parity on the source device,
@@ -1590,25 +1577,24 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
if (sblock->sparity)
return;
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ for (i = 0; i < sblock->sector_count; i++) {
int ret;
- ret = scrub_write_page_to_dev_replace(sblock, page_num);
+ ret = scrub_write_sector_to_dev_replace(sblock, i);
if (ret)
atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num)
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
{
- struct scrub_page *spage = sblock->pagev[page_num];
+ struct scrub_sector *sector = sblock->sectors[sector_num];
- BUG_ON(spage->page == NULL);
- if (spage->io_error)
- clear_page(page_address(spage->page));
+ BUG_ON(sector->page == NULL);
+ if (sector->io_error)
+ clear_page(page_address(sector->page));
- return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+ return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
@@ -1633,8 +1619,8 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
{
struct scrub_bio *sbio;
int ret;
@@ -1650,45 +1636,38 @@ again:
return -ENOMEM;
}
sctx->wr_curr_bio->sctx = sctx;
- sctx->wr_curr_bio->page_count = 0;
+ sctx->wr_curr_bio->sector_count = 0;
}
sbio = sctx->wr_curr_bio;
- if (sbio->page_count == 0) {
- struct bio *bio;
-
- ret = fill_writer_pointer_gap(sctx,
- spage->physical_for_dev_replace);
+ if (sbio->sector_count == 0) {
+ ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
if (ret) {
mutex_unlock(&sctx->wr_lock);
return ret;
}
- sbio->physical = spage->physical_for_dev_replace;
- sbio->logical = spage->logical;
+ sbio->physical = sector->physical_for_dev_replace;
+ sbio->logical = sector->logical;
sbio->dev = sctx->wr_tgtdev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_bio);
- sbio->bio = bio;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_WRITE, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_wr_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_wr_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * sectorsize !=
- spage->physical_for_dev_replace ||
- sbio->logical + sbio->page_count * sectorsize !=
- spage->logical) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sector->physical_for_dev_replace ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sector->logical) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
if (ret != sectorsize) {
- if (sbio->page_count < 1) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
mutex_unlock(&sctx->wr_lock);
@@ -1698,10 +1677,10 @@ again:
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- scrub_page_get(spage);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_bio)
+ sbio->sectors[sbio->sector_count] = sector;
+ scrub_sector_get(sector);
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
@@ -1717,16 +1696,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
- WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
if (btrfs_is_zoned(sctx->fs_info))
- sctx->write_pointer = sbio->physical + sbio->page_count *
+ sctx->write_pointer = sbio->physical + sbio->sector_count *
sctx->fs_info->sectorsize;
}
@@ -1738,31 +1717,31 @@ static void scrub_wr_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
- btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+ INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
+ queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_wr_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
+ sector->io_error = 1;
atomic64_inc(&dev_replace->num_write_errors);
}
}
- for (i = 0; i < sbio->page_count; i++)
- scrub_page_put(sbio->pagev[i]);
+ for (i = 0; i < sbio->sector_count; i++)
+ scrub_sector_put(sbio->sectors[i]);
bio_put(sbio->bio);
kfree(sbio);
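
This completion handler shows the general shape of the btrfs_workqueue to plain-workqueue conversion used throughout the patch: INIT_WORK() right before queue_work(), and container_of() in the worker to recover the owning structure. A reduced sketch with the scrub specifics stripped out; completion_ctx and both function names are illustrative only:

#include <linux/slab.h>
#include <linux/workqueue.h>

struct completion_ctx {
	struct work_struct work;
	int status;
};

static void completion_worker(struct work_struct *work)
{
	/* Recover the containing context from the embedded work item. */
	struct completion_ctx *ctx = container_of(work, struct completion_ctx, work);

	/* ... act on ctx->status ... */
	kfree(ctx);
}

/* Called from an end_io-style context to defer the heavy lifting. */
static void on_io_done(struct workqueue_struct *wq, struct completion_ctx *ctx)
{
	INIT_WORK(&ctx->work, completion_worker);
	queue_work(wq, &ctx->work);
}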
@@ -1786,8 +1765,8 @@ static int scrub_checksum(struct scrub_block *sblock)
sblock->generation_error = 0;
sblock->checksum_error = 0;
- WARN_ON(sblock->page_count < 1);
- flags = sblock->pagev[0]->flags;
+ WARN_ON(sblock->sector_count < 1);
+ flags = sblock->sectors[0]->flags;
ret = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA)
ret = scrub_checksum_data(sblock);
@@ -1809,26 +1788,26 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
- BUG_ON(sblock->page_count < 1);
- spage = sblock->pagev[0];
- if (!spage->have_csum)
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ if (!sector->have_csum)
return 0;
- kaddr = page_address(spage->page);
+ kaddr = page_address(sector->page);
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
/*
- * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
+ * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector's page
* only contains one sector of data.
*/
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- if (memcmp(csum, spage->csum, fs_info->csum_size))
+ if (memcmp(csum, sector->csum, fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->checksum_error;
}
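
scrub_checksum_data() uses the synchronous shash API: the descriptor lives on the stack via SHASH_DESC_ON_STACK(), and since the sector is one contiguous buffer, a single crypto_shash_digest() call covers init, update and final (which makes the separate crypto_shash_init() above redundant but harmless). A minimal sketch of the same verification, assuming the caller already has the expected on-disk csum; verify_csum() is a hypothetical name:

#include <crypto/hash.h>
#include <linux/string.h>

/* Return 0 if @data of @len bytes hashes to @expected, 1 otherwise. */
static int verify_csum(struct crypto_shash *tfm, const u8 *data, u32 len,
		       const u8 *expected, u32 csum_size)
{
	SHASH_DESC_ON_STACK(shash, tfm);
	u8 csum[64];	/* Large enough for any checksum btrfs supports. */

	shash->tfm = tfm;
	/* One-shot digest over a single contiguous buffer. */
	crypto_shash_digest(shash, data, len, csum);
	return memcmp(csum, expected, csum_size) ? 1 : 0;
}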
@@ -1849,16 +1828,16 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
const u32 sectorsize = sctx->fs_info->sectorsize;
const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
int i;
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
- BUG_ON(sblock->page_count < 1);
+ BUG_ON(sblock->sector_count < 1);
- /* Each member in pagev is just one block, not a full page */
- ASSERT(sblock->page_count == num_sectors);
+ /* Each member in sectors is just one sector */
+ ASSERT(sblock->sector_count == num_sectors);
- spage = sblock->pagev[0];
- kaddr = page_address(spage->page);
+ sector = sblock->sectors[0];
+ kaddr = page_address(sector->page);
h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
@@ -1867,15 +1846,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (spage->logical != btrfs_stack_header_bytenr(h))
+ if (sector->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
- if (spage->generation != btrfs_stack_header_generation(h)) {
+ if (sector->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
- if (!scrub_check_fsid(h->fsid, spage))
+ if (!scrub_check_fsid(h->fsid, sector))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
@@ -1888,7 +1867,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
sectorsize - BTRFS_CSUM_SIZE);
for (i = 1; i < num_sectors; i++) {
- kaddr = page_address(sblock->pagev[i]->page);
+ kaddr = page_address(sblock->sectors[i]->page);
crypto_shash_update(shash, kaddr, sectorsize);
}
@@ -1906,23 +1885,23 @@ static int scrub_checksum_super(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
- BUG_ON(sblock->page_count < 1);
- spage = sblock->pagev[0];
- kaddr = page_address(spage->page);
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ kaddr = page_address(sector->page);
s = (struct btrfs_super_block *)kaddr;
- if (spage->logical != btrfs_super_bytenr(s))
+ if (sector->logical != btrfs_super_bytenr(s))
++fail_cor;
- if (spage->generation != btrfs_super_generation(s))
+ if (sector->generation != btrfs_super_generation(s))
++fail_gen;
- if (!scrub_check_fsid(s->fsid, spage))
+ if (!scrub_check_fsid(s->fsid, sector))
++fail_cor;
shash->tfm = fs_info->csum_shash;
@@ -1943,10 +1922,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
- btrfs_dev_stat_inc_and_print(spage->dev,
+ btrfs_dev_stat_inc_and_print(sector->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
- btrfs_dev_stat_inc_and_print(spage->dev,
+ btrfs_dev_stat_inc_and_print(sector->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
@@ -1966,23 +1945,23 @@ static void scrub_block_put(struct scrub_block *sblock)
if (sblock->sparity)
scrub_parity_put(sblock->sparity);
- for (i = 0; i < sblock->page_count; i++)
- scrub_page_put(sblock->pagev[i]);
+ for (i = 0; i < sblock->sector_count; i++)
+ scrub_sector_put(sblock->sectors[i]);
kfree(sblock);
}
}
-static void scrub_page_get(struct scrub_page *spage)
+static void scrub_sector_get(struct scrub_sector *sector)
{
- atomic_inc(&spage->refs);
+ atomic_inc(&sector->refs);
}
-static void scrub_page_put(struct scrub_page *spage)
+static void scrub_sector_put(struct scrub_sector *sector)
{
- if (atomic_dec_and_test(&spage->refs)) {
- if (spage->page)
- __free_page(spage->page);
- kfree(spage);
+ if (atomic_dec_and_test(&sector->refs)) {
+ if (sector->page)
+ __free_page(sector->page);
+ kfree(sector);
}
}
@@ -2057,13 +2036,14 @@ static void scrub_submit(struct scrub_ctx *sctx)
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
}
-static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
{
- struct scrub_block *sblock = spage->sblock;
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
@@ -2078,7 +2058,7 @@ again:
if (sctx->curr != -1) {
sctx->first_free = sctx->bios[sctx->curr]->next_free;
sctx->bios[sctx->curr]->next_free = -1;
- sctx->bios[sctx->curr]->page_count = 0;
+ sctx->bios[sctx->curr]->sector_count = 0;
spin_unlock(&sctx->list_lock);
} else {
spin_unlock(&sctx->list_lock);
@@ -2086,37 +2066,31 @@ again:
}
}
sbio = sctx->bios[sctx->curr];
- if (sbio->page_count == 0) {
- struct bio *bio;
-
- sbio->physical = spage->physical;
- sbio->logical = spage->logical;
- sbio->dev = spage->dev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_bio);
- sbio->bio = bio;
+ if (sbio->sector_count == 0) {
+ sbio->physical = sector->physical;
+ sbio->logical = sector->logical;
+ sbio->dev = sector->dev;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_READ, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * sectorsize !=
- spage->physical ||
- sbio->logical + sbio->page_count * sectorsize !=
- spage->logical ||
- sbio->dev != spage->dev) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sector->physical ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sector->logical ||
+ sbio->dev != sector->dev) {
scrub_submit(sctx);
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ sbio->sectors[sbio->sector_count] = sector;
+ ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
if (ret != sectorsize) {
- if (sbio->page_count < 1) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
return -EIO;
@@ -2126,9 +2100,9 @@ again:
}
scrub_block_get(sblock); /* one for the page added to the bio */
- atomic_inc(&sblock->outstanding_pages);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_bio)
+ atomic_inc(&sblock->outstanding_sectors);
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_submit(sctx);
return 0;
@@ -2144,10 +2118,10 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
bio_put(bio);
- btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+ queue_work(fs_info->scrub_workers, &sblock->work);
}
-static void scrub_missing_raid56_worker(struct btrfs_work *work)
+static void scrub_missing_raid56_worker(struct work_struct *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
@@ -2155,8 +2129,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
u64 logical;
struct btrfs_device *dev;
- logical = sblock->pagev[0]->logical;
- dev = sblock->pagev[0]->dev;
+ logical = sblock->sectors[0]->logical;
+ dev = sblock->sectors[0]->dev;
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
@@ -2193,8 +2167,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = sblock->page_count * PAGE_SIZE;
- u64 logical = sblock->pagev[0]->logical;
+ u64 length = sblock->sector_count << fs_info->sectorsize_bits;
+ u64 logical = sblock->sectors[0]->logical;
struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
@@ -2213,12 +2187,12 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
- * there's a bug in scrub_remap_extent()/btrfs_map_block().
+ * there's a bug in scrub_find_good_copy()/btrfs_map_block().
*/
goto bioc_out;
}
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
@@ -2227,13 +2201,17 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
if (!rbio)
goto rbio_out;
- for (i = 0; i < sblock->page_count; i++) {
- struct scrub_page *spage = sblock->pagev[i];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+ /*
+ * For now, our scrub is still one page per sector, so pgoff
+ * is always 0.
+ */
+ raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
}
- btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
+ INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
@@ -2249,7 +2227,7 @@ bioc_out:
spin_unlock(&sctx->stat_lock);
}
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace)
@@ -2273,7 +2251,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
sblock->no_io_error_seen = 1;
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
+ struct scrub_sector *sector;
/*
* Here we will allocate one page for one sector to scrub.
* This is fine if PAGE_SIZE == sectorsize, but will cost
@@ -2281,8 +2259,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
*/
u32 l = min(sectorsize, len);
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
+ sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ if (!sector) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2290,26 +2268,26 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->physical_for_dev_replace = physical_for_dev_replace;
- spage->mirror_num = mirror_num;
+ ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
+ scrub_sector_get(sector);
+ sblock->sectors[index] = sector;
+ sector->sblock = sblock;
+ sector->dev = dev;
+ sector->flags = flags;
+ sector->generation = gen;
+ sector->logical = logical;
+ sector->physical = physical;
+ sector->physical_for_dev_replace = physical_for_dev_replace;
+ sector->mirror_num = mirror_num;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->fs_info->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
+ sblock->sector_count++;
+ sector->page = alloc_page(GFP_KERNEL);
+ if (!sector->page)
goto leave_nomem;
len -= l;
logical += l;
@@ -2317,7 +2295,7 @@ leave_nomem:
physical_for_dev_replace += l;
}
- WARN_ON(sblock->page_count == 0);
+ WARN_ON(sblock->sector_count == 0);
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
/*
* This case should only be hit for RAID 5/6 device replace. See
@@ -2325,11 +2303,11 @@ leave_nomem:
*/
scrub_missing_raid56_pages(sblock);
} else {
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
@@ -2353,31 +2331,31 @@ static void scrub_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
+ queue_work(fs_info->scrub_workers, &sbio->work);
}
-static void scrub_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
- spage->sblock->no_io_error_seen = 0;
+ sector->io_error = 1;
+ sector->sblock->no_io_error_seen = 0;
}
}
- /* now complete the scrub_block items that have all pages completed */
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
- struct scrub_block *sblock = spage->sblock;
+ /* Now complete the scrub_block items that have all sectors completed */
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
+ struct scrub_block *sblock = sector->sblock;
- if (atomic_dec_and_test(&sblock->outstanding_pages))
+ if (atomic_dec_and_test(&sblock->outstanding_sectors))
scrub_block_complete(sblock);
scrub_block_put(sblock);
}
@@ -2456,8 +2434,8 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
if (sblock->sparity && corrupted && !sblock->data_corrected) {
- u64 start = sblock->pagev[0]->logical;
- u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+ u64 start = sblock->sectors[0]->logical;
+ u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
@@ -2532,8 +2510,11 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u64 physical_for_dev_replace)
+ u64 gen, int mirror_num)
{
+ struct btrfs_device *src_dev = dev;
+ u64 src_physical = physical;
+ int src_mirror = mirror_num;
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
@@ -2561,6 +2542,18 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
WARN_ON(1);
}
+ /*
+ * For the dev-replace case, @dev can be a missing device.
+ * Regular scrub avoids running on a missing device entirely,
+ * as that would trigger tons of read errors.
+ *
+ * Reading from a missing device would also cause the read error
+ * counts to increase unnecessarily.
+ * So here we change the read source to a good mirror.
+ */
+ if (sctx->is_dev_replace && !dev->bdev)
+ scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
+ &src_dev, &src_mirror);
while (len) {
u32 l = min(len, blocksize);
int have_csum = 0;
@@ -2571,20 +2564,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
if (have_csum == 0)
++sctx->stat.no_csum;
}
- ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
- mirror_num, have_csum ? csum : NULL,
- physical_for_dev_replace);
+ ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
+ flags, gen, src_mirror,
+ have_csum ? csum : NULL, physical);
if (ret)
return ret;
len -= l;
logical += l;
physical += l;
- physical_for_dev_replace += l;
+ src_physical += l;
}
return 0;
}
-static int scrub_pages_for_parity(struct scrub_parity *sparity,
+static int scrub_sectors_for_parity(struct scrub_parity *sparity,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
@@ -2613,10 +2606,10 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
+ struct scrub_sector *sector;
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
+ sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ if (!sector) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2624,29 +2617,29 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
/* For scrub block */
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
+ scrub_sector_get(sector);
+ sblock->sectors[index] = sector;
/* For scrub parity */
- scrub_page_get(spage);
- list_add_tail(&spage->list, &sparity->spages);
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->mirror_num = mirror_num;
+ scrub_sector_get(sector);
+ list_add_tail(&sector->list, &sparity->sectors_list);
+ sector->sblock = sblock;
+ sector->dev = dev;
+ sector->flags = flags;
+ sector->generation = gen;
+ sector->logical = logical;
+ sector->physical = physical;
+ sector->mirror_num = mirror_num;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->fs_info->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
+ sblock->sector_count++;
+ sector->page = alloc_page(GFP_KERNEL);
+ if (!sector->page)
goto leave_nomem;
@@ -2656,19 +2649,19 @@ leave_nomem:
physical += sectorsize;
}
- WARN_ON(sblock->page_count == 0);
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ WARN_ON(sblock->sector_count == 0);
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
- /* last one frees, either here or in bio completion for last page */
+ /* Last one frees, either here or in bio completion for last sector */
scrub_block_put(sblock);
return 0;
}
@@ -2707,7 +2700,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (have_csum == 0)
goto skip;
}
- ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+ ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
flags, gen, mirror_num,
have_csum ? csum : NULL);
if (ret)
@@ -2767,7 +2760,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
static void scrub_free_parity(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
- struct scrub_page *curr, *next;
+ struct scrub_sector *curr, *next;
int nbits;
nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
@@ -2778,15 +2771,15 @@ static void scrub_free_parity(struct scrub_parity *sparity)
spin_unlock(&sctx->stat_lock);
}
- list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+ list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
list_del_init(&curr->list);
- scrub_page_put(curr);
+ scrub_sector_put(curr);
}
kfree(sparity);
}
-static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+static void scrub_parity_bio_endio_worker(struct work_struct *work)
{
struct scrub_parity *sparity = container_of(work, struct scrub_parity,
work);
@@ -2798,7 +2791,7 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
static void scrub_parity_bio_endio(struct bio *bio)
{
- struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_parity *sparity = bio->bi_private;
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_status)
@@ -2807,9 +2800,8 @@ static void scrub_parity_bio_endio(struct bio *bio)
bio_put(bio);
- btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
- NULL);
- btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
+ INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
+ queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -2834,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
if (ret || !bioc || !bioc->raid_map)
goto bioc_out;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
@@ -2882,6 +2874,251 @@ static void scrub_parity_put(struct scrub_parity *sparity)
scrub_parity_check_and_repair(sparity);
}
+/*
+ * Return 0 if the extent item covers any byte of the search range.
+ * Return <0 if the extent item is before @search_start.
+ * Return >0 if the extent item is after @search_start + @search_len.
+ */
+static int compare_extent_item_range(struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
+ u64 len;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ len = fs_info->nodesize;
+ else
+ len = key.offset;
+
+ if (key.objectid + len <= search_start)
+ return -1;
+ if (key.objectid >= search_start + search_len)
+ return 1;
+ return 0;
+}
+
+/*
+ * Locate one extent item which covers any byte in range
+ * [@search_start, @search_start + @search_len)
+ *
+ * If the path is not initialized, we will initialize the search by doing
+ * a btrfs_search_slot().
+ * If the path is already initialized, we will use the path as the initial
+ * slot, to avoid duplicated btrfs_search_slot() calls.
+ *
+ * NOTE: If an extent item starts before @search_start, we will still
+ * return the extent item. This is for data extents crossing stripe boundaries.
+ *
+ * Return 0 if we found such an extent item, and @path will point to the extent item.
+ * Return >0 if no such extent item can be found, and @path will be released.
+ * Return <0 if we hit a fatal error, and @path will be released.
+ */
+static int find_first_extent_item(struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct btrfs_key key;
+ int ret;
+
+ /* Continue using the existing path */
+ if (path->nodes[0])
+ goto search_forward;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = search_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(ret > 0);
+ /*
+ * Here we intentionally pass 0 as @min_objectid, as there could be
+ * an extent item starting before @search_start.
+ */
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
+ if (ret < 0)
+ return ret;
+ /*
+ * No matter whether we have found an extent item, the next loop will
+ * properly do every check on the key.
+ */
+search_forward:
+ while (true) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid >= search_start + search_len)
+ break;
+ if (key.type != BTRFS_METADATA_ITEM_KEY &&
+ key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto next;
+
+ ret = compare_extent_item_range(path, search_start, search_len);
+ if (ret == 0)
+ return ret;
+ if (ret > 0)
+ break;
+next:
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret) {
+ /* Either no more item or fatal error */
+ btrfs_release_path(path);
+ return ret;
+ }
+ }
+ }
+ btrfs_release_path(path);
+ return 1;
+}
+
+static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
+ u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_item *ei;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
+ key.type == BTRFS_EXTENT_ITEM_KEY);
+ *extent_start_ret = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ *size_ret = path->nodes[0]->fs_info->nodesize;
+ else
+ *size_ret = key.offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
+ *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
+ *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
+}
+
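find_first_extent_item() and get_extent_info() are meant to be driven from a cursor loop that reuses one path across iterations, which is exactly how scrub_raid56_data_stripe_for_parity() below and scrub_simple_mirror() later use them. A condensed sketch of the calling pattern (hypothetical wrapper, error handling and range truncation trimmed):

/* Sketch: visit every extent item overlapping [start, start + len). */
static int for_each_extent_in_range(struct btrfs_root *extent_root,
				    u64 start, u64 len)
{
	struct btrfs_path path = { 0 };
	u64 cur = start;
	int ret = 0;

	path.search_commit_root = 1;
	path.skip_locking = 1;
	while (cur < start + len) {
		u64 extent_start, extent_size, flags, gen;

		/* Reuses @path, so only the first call does a full search. */
		ret = find_first_extent_item(extent_root, &path, cur,
					     start + len - cur);
		if (ret > 0) {		/* No more extent items in range. */
			ret = 0;
			break;
		}
		if (ret < 0)
			break;
		get_extent_info(&path, &extent_start, &extent_size,
				&flags, &gen);
		/* ... process [max(extent_start, cur), extent end) ... */
		cur = extent_start + extent_size;
	}
	btrfs_release_path(&path);
	return ret;
}
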
+static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
+ u64 boundary_start, u64 boundary_len)
+{
+ return (extent_start < boundary_start &&
+ extent_start + extent_len > boundary_start) ||
+ (extent_start < boundary_start + boundary_len &&
+ extent_start + extent_len > boundary_start + boundary_len);
+}
+
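does_range_cross_boundary() is true only when the extent straddles one of the two edges of the boundary window; an extent entirely inside or entirely outside is acceptable. A worked example with an assumed boundary window [128K, 192K), i.e. boundary_start = 128K and boundary_len = 64K:

/*
 *   extent [100K, 120K): entirely before the window -> false
 *   extent [140K, 180K): entirely inside the window -> false
 *   extent [120K, 140K): straddles 128K             -> true (first clause)
 *   extent [180K, 200K): straddles 192K             -> true (second clause)
 */
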
+static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
+ struct scrub_parity *sparity,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logical)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
+ u64 cur_logical = logical;
+ int ret;
+
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+
+ /* Path must not be populated */
+ ASSERT(!path->nodes[0]);
+
+ while (cur_logical < logical + map->stripe_len) {
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_device *extent_dev;
+ u64 extent_start;
+ u64 extent_size;
+ u64 mapped_length;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 extent_physical;
+ u64 extent_mirror_num;
+
+ ret = find_first_extent_item(extent_root, path, cur_logical,
+ logical + map->stripe_len - cur_logical);
+ /* No more extent items in this data stripe */
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(path, &extent_start, &extent_size, &extent_flags,
+ &extent_gen);
+
+ /* Metadata should not cross stripe boundaries */
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_size,
+ logical, map->stripe_len)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ extent_start, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += extent_size;
+ continue;
+ }
+
+ /* Skip the hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /* Truncate the range inside this data stripe */
+ extent_size = min(extent_start + extent_size,
+ logical + map->stripe_len) - cur_logical;
+ extent_start = cur_logical;
+ ASSERT(extent_size <= U32_MAX);
+
+ scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+
+ mapped_length = extent_size;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
+ &mapped_length, &bioc, 0);
+ if (!ret && (!bioc || mapped_length < extent_size))
+ ret = -EIO;
+ if (ret) {
+ btrfs_put_bioc(bioc);
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
+
+ ret = btrfs_lookup_csums_range(csum_root, extent_start,
+ extent_start + extent_size - 1,
+ &sctx->csum_list, 1);
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ ret = scrub_extent_for_parity(sparity, extent_start,
+ extent_size, extent_physical,
+ extent_dev, extent_flags,
+ extent_gen, extent_mirror_num);
+ scrub_free_csums(sctx);
+
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ cond_resched();
+ cur_logical += extent_size;
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
@@ -2889,28 +3126,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 logic_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
- struct btrfs_root *csum_root;
- struct btrfs_extent_item *extent;
- struct btrfs_io_context *bioc = NULL;
struct btrfs_path *path;
- u64 flags;
+ u64 cur_logical;
int ret;
- int slot;
- struct extent_buffer *l;
- struct btrfs_key key;
- u64 generation;
- u64 extent_logical;
- u64 extent_physical;
- /* Check the comment in scrub_stripe() for why u32 is enough here */
- u32 extent_len;
- u64 mapped_length;
- struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
int bitmap_len;
- int extent_mirror_num;
- int stop_loop = 0;
path = btrfs_alloc_path();
if (!path) {
@@ -2943,178 +3164,19 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
refcount_set(&sparity->refs, 1);
- INIT_LIST_HEAD(&sparity->spages);
+ INIT_LIST_HEAD(&sparity->sectors_list);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
ret = 0;
- while (logic_start < logic_end) {
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logic_start;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ for (cur_logical = logic_start; cur_logical < logic_end;
+ cur_logical += map->stripe_len) {
+ ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
+ sdev, path, cur_logical);
if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
-
- stop_loop = 0;
- while (1) {
- u64 bytes;
-
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
-
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
-
- if (key.objectid + bytes <= logic_start)
- goto next;
-
- if (key.objectid >= logic_end) {
- stop_loop = 1;
- break;
- }
-
- while (key.objectid >= logic_start + map->stripe_len)
- logic_start += map->stripe_len;
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logic_start ||
- key.objectid + bytes >
- logic_start + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logic_start);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-again:
- extent_logical = key.objectid;
- ASSERT(bytes <= U32_MAX);
- extent_len = bytes;
-
- if (extent_logical < logic_start) {
- extent_len -= logic_start - extent_logical;
- extent_logical = logic_start;
- }
-
- if (extent_logical + extent_len >
- logic_start + map->stripe_len)
- extent_len = logic_start + map->stripe_len -
- extent_logical;
-
- scrub_parity_mark_sectors_data(sparity, extent_logical,
- extent_len);
-
- mapped_length = extent_len;
- bioc = NULL;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bioc,
- 0);
- if (!ret) {
- if (!bioc || mapped_length < extent_len)
- ret = -EIO;
- }
- if (ret) {
- btrfs_put_bioc(bioc);
- goto out;
- }
- extent_physical = bioc->stripes[0].physical;
- extent_mirror_num = bioc->mirror_num;
- extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
-
- csum_root = btrfs_csum_root(fs_info, extent_logical);
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical + extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
-
- ret = scrub_extent_for_parity(sparity, extent_logical,
- extent_len,
- extent_physical,
- extent_dev, flags,
- generation,
- extent_mirror_num);
-
- scrub_free_csums(sctx);
-
- if (ret)
- goto out;
-
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- logic_start += map->stripe_len;
-
- if (logic_start >= logic_end) {
- stop_loop = 1;
- break;
- }
-
- if (logic_start < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
- }
-next:
- path->slots[0]++;
- }
-
- btrfs_release_path(path);
-
- if (stop_loop)
break;
-
- logic_start += map->stripe_len;
- }
-out:
- if (ret < 0) {
- ASSERT(logic_end - logic_start <= U32_MAX);
- scrub_parity_mark_sectors_error(sparity, logic_start,
- logic_end - logic_start);
}
+
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
@@ -3165,6 +3227,206 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
return ret;
}
+/*
+ * Scrub one range which can only have the simple mirror based profile.
+ * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
+ * RAID0/RAID10).
+ *
+ * Since we may need to handle a subset of a block group, we need the
+ * @logical_start and @logical_length parameters.
+ */
+static int scrub_simple_mirror(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ u64 logical_start, u64 logical_length,
+ struct btrfs_device *device,
+ u64 physical, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ const u64 logical_end = logical_start + logical_length;
+ /* An artificial limit, inherited from the old scrub behavior */
+ const u32 max_length = SZ_64K;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ int ret;
+
+ /* The range must be inside the bg */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+ /* Go through each extent item inside the logical range */
+ while (cur_logical < logical_end) {
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 scrub_len;
+
+ /* Canceled? */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ break;
+ }
+ /* Paused? */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* Push queued extents */
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ sctx->flush_all_writes = false;
+ scrub_blocked_if_needed(fs_info);
+ }
+ /* Block group removed? */
+ spin_lock(&bg->lock);
+ if (bg->removed) {
+ spin_unlock(&bg->lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&bg->lock);
+
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ logical_end - cur_logical);
+ if (ret > 0) {
+ /* No more extents, just update the accounting */
+ sctx->stat.last_physical = physical + logical_length;
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ /* Skip the hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /*
+ * Scrub len has three limits:
+ * - Extent size limit
+ * - Scrub range limit
+ * This is especially important for RAID0/RAID10 to reuse
+ * this function
+ * - Max scrub size limit
+ */
+ scrub_len = min(min(extent_start + extent_len,
+ logical_end), cur_logical + max_length) -
+ cur_logical;
+
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = btrfs_lookup_csums_range(csum_root, cur_logical,
+ cur_logical + scrub_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ break;
+ }
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_len,
+ logical_start, logical_length)) {
+ btrfs_err(fs_info,
+"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
+ extent_start, logical_start, logical_end);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += scrub_len;
+ continue;
+ }
+ ret = scrub_extent(sctx, map, cur_logical, scrub_len,
+ cur_logical - logical_start + physical,
+ device, extent_flags, extent_gen,
+ mirror_num);
+ scrub_free_csums(sctx);
+ if (ret)
+ break;
+ if (sctx->is_dev_replace)
+ sync_replace_for_zoned(sctx);
+ cur_logical += scrub_len;
+ /* Don't hold the CPU for too long */
+ cond_resched();
+ }
+ btrfs_release_path(&path);
+ return ret;
+}
+
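The scrub_len computation inside scrub_simple_mirror() clamps each chunk of work by three upper bounds at once: the end of the current extent, the end of the requested range, and the 64K per-iteration cap. A worked example with assumed numbers (cur_logical = 96K, extent [64K, 64K + 1M), logical_end = 1M, max_length = 64K):

/*
 * scrub_len = min(min(extent_start + extent_len, logical_end),
 *                 cur_logical + max_length) - cur_logical
 *           = min(min(64K + 1M, 1M), 96K + 64K) - 96K
 *           = min(1M, 160K) - 96K
 *           = 64K
 */
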
+/* Calculate the full stripe length for simple stripe based profiles */
+static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+
+ return map->num_stripes / map->sub_stripes * map->stripe_len;
+}
+
+/* Get the logical bytenr for the stripe */
+static u64 simple_stripe_get_logical(struct map_lookup *map,
+ struct btrfs_block_group *bg,
+ int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+ /*
+ * (stripe_index / sub_stripes) gives how many data stripes we need to
+ * skip.
+ */
+ return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+}
+
+/* Get the mirror number for the stripe */
+static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+ /* For RAID0 it's fixed to 1; for RAID10 the result alternates 1,2,1,2,... */
+ return stripe_index % map->sub_stripes + 1;
+}
+
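Together the three helpers above encode the simple-stripe geometry. A worked example for an assumed RAID10 chunk with num_stripes = 4, sub_stripes = 2, stripe length S and block group start B (full stripe length = 4 / 2 * S = 2S):

/*
 *   stripe_index   simple_stripe_get_logical()   simple_stripe_mirror_num()
 *   0              (0 / 2) * S + B = B           0 % 2 + 1 = 1
 *   1              (1 / 2) * S + B = B           1 % 2 + 1 = 2
 *   2              (2 / 2) * S + B = B + S       2 % 2 + 1 = 1
 *   3              (3 / 2) * S + B = B + S       3 % 2 + 1 = 2
 */
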
+static int scrub_simple_stripe(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct btrfs_device *device,
+ int stripe_index)
+{
+ const u64 logical_increment = simple_stripe_full_stripe_len(map);
+ const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
+ const u64 orig_physical = map->stripes[stripe_index].physical;
+ const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
+ u64 cur_logical = orig_logical;
+ u64 cur_physical = orig_physical;
+ int ret = 0;
+
+ while (cur_logical < bg->start + bg->length) {
+ /*
+ * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
+ * just RAID1, so we can reuse scrub_simple_mirror() to scrub
+ * this stripe.
+ */
+ ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
+ cur_logical, map->stripe_len, device,
+ cur_physical, mirror_num);
+ if (ret)
+ return ret;
+ /* Skip to the next stripe which belongs to the target device */
+ cur_logical += logical_increment;
+ /* For the physical offset, we just go to the next stripe */
+ cur_physical += map->stripe_len;
+ }
+ return ret;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
struct map_lookup *map,
@@ -3175,59 +3437,22 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root;
struct btrfs_root *csum_root;
- struct btrfs_extent_item *extent;
struct blk_plug plug;
+ const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
- u64 flags;
int ret;
- int slot;
- u64 nstripes;
- struct extent_buffer *l;
- u64 physical;
+ u64 physical = map->stripes[stripe_index].physical;
+ const u64 physical_end = physical + dev_extent_len;
u64 logical;
u64 logic_end;
- u64 physical_end;
- u64 generation;
- int mirror_num;
- struct btrfs_key key;
+ /* The logical increment after finishing one stripe */
u64 increment;
+ /* Offset inside the chunk */
u64 offset;
- u64 extent_logical;
- u64 extent_physical;
- /*
- * Unlike chunk length, extent length should never go beyond
- * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
- */
- u32 extent_len;
u64 stripe_logical;
u64 stripe_end;
- struct btrfs_device *extent_dev;
- int extent_mirror_num;
int stop_loop = 0;
- physical = map->stripes[stripe_index].physical;
- offset = 0;
- nstripes = div64_u64(dev_extent_len, map->stripe_len);
- mirror_num = 1;
- increment = map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- offset = map->stripe_len * stripe_index;
- increment = map->stripe_len * map->num_stripes;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- int factor = map->num_stripes / map->sub_stripes;
- offset = map->stripe_len * (stripe_index / map->sub_stripes);
- increment = map->stripe_len * factor;
- mirror_num = stripe_index % map->sub_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- mirror_num = stripe_index % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- mirror_num = stripe_index % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical, stripe_index, map, &offset,
- NULL);
- increment = map->stripe_len * nr_data_stripes(map);
- }
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3241,21 +3466,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
path->skip_locking = 1;
path->reada = READA_FORWARD;
- logical = chunk_logical + offset;
- physical_end = physical + nstripes * map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical_end, stripe_index,
- map, &logic_end, NULL);
- logic_end += chunk_logical;
- } else {
- logic_end = logical + increment * nstripes;
- }
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- root = btrfs_extent_root(fs_info, logical);
- csum_root = btrfs_csum_root(fs_info, logical);
+ root = btrfs_extent_root(fs_info, bg->start);
+ csum_root = btrfs_csum_root(fs_info, bg->start);
/*
* collect all data csums for the stripe to avoid seeking during
@@ -3272,241 +3488,83 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
/*
- * now find all extents for each stripe and scrub them
+ * There used to be a big double loop to handle all profiles using the
+ * same routine, which grew larger and more gross over time.
+ *
+ * So here we handle each profile differently, so that simpler profiles
+ * have simpler scrubbing functions.
*/
- ret = 0;
- while (physical < physical_end) {
- /*
- * canceled?
- */
- if (atomic_read(&fs_info->scrub_cancel_req) ||
- atomic_read(&sctx->cancel_req)) {
- ret = -ECANCELED;
- goto out;
- }
+ if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
- * check to see if we have to pause
+ * The check above rules out all complex profiles; the remaining
+ * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
+ * mirrored duplication without striping.
+ *
+ * Only @physical and @mirror_num need to be calculated using
+ * @stripe_index.
*/
- if (atomic_read(&fs_info->scrub_pause_req)) {
- /* push queued extents */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
- sctx->flush_all_writes = false;
- scrub_blocked_if_needed(fs_info);
- }
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, stripe_index,
- map, &logical,
- &stripe_logical);
- logical += chunk_logical;
- if (ret) {
- /* it is parity strip */
- stripe_logical += chunk_logical;
- stripe_end = stripe_logical + increment;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto skip;
- }
- }
-
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- /* there's no smaller item, so stick with the
- * larger one */
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
-
- stop_loop = 0;
- while (1) {
- u64 bytes;
-
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
-
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
-
- if (key.objectid + bytes <= logical)
- goto next;
-
- if (key.objectid >= logical + map->stripe_len) {
- /* out of this device extent */
- if (key.objectid >= logic_end)
- stop_loop = 1;
- break;
- }
-
- /*
- * If our block group was removed in the meanwhile, just
- * stop scrubbing since there is no point in continuing.
- * Continuing would prevent reusing its device extents
- * for new block groups for a long time.
- */
- spin_lock(&bg->lock);
- if (bg->removed) {
- spin_unlock(&bg->lock);
- ret = 0;
- goto out;
- }
- spin_unlock(&bg->lock);
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logical ||
- key.objectid + bytes >
- logical + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logical);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-
-again:
- extent_logical = key.objectid;
- ASSERT(bytes <= U32_MAX);
- extent_len = bytes;
-
- /*
- * trim extent to this stripe
- */
- if (extent_logical < logical) {
- extent_len -= logical - extent_logical;
- extent_logical = logical;
- }
- if (extent_logical + extent_len >
- logical + map->stripe_len) {
- extent_len = logical + map->stripe_len -
- extent_logical;
- }
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ bg->start, bg->length, scrub_dev,
+ map->stripes[stripe_index].physical,
+ stripe_index + 1);
+ offset = 0;
+ goto out;
+ }
+ if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
+ scrub_dev, stripe_index);
+ offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ goto out;
+ }
- extent_physical = extent_logical - logical + physical;
- extent_dev = scrub_dev;
- extent_mirror_num = mirror_num;
- if (sctx->is_dev_replace)
- scrub_remap_extent(fs_info, extent_logical,
- extent_len, &extent_physical,
- &extent_dev,
- &extent_mirror_num);
-
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical + extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
- }
+ /* Only RAID56 goes through the old code */
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ ret = 0;
- ret = scrub_extent(sctx, map, extent_logical, extent_len,
- extent_physical, extent_dev, flags,
- generation, extent_mirror_num,
- extent_logical - logical + physical);
+ /* Calculate the logical end of the stripe */
+ get_raid56_logic_offset(physical_end, stripe_index,
+ map, &logic_end, NULL);
+ logic_end += chunk_logical;
- scrub_free_csums(sctx);
+ /* Initialize @offset in case we need to go to the out: label */
+ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
+ increment = map->stripe_len * nr_data_stripes(map);
+ /*
+ * Due to the rotation, for RAID56 it's better to iterate the stripes
+ * using their physical offsets.
+ */
+ while (physical < physical_end) {
+ ret = get_raid56_logic_offset(physical, stripe_index, map,
+ &logical, &stripe_logical);
+ logical += chunk_logical;
+ if (ret) {
+ /* It is a parity stripe */
+ stripe_logical += chunk_logical;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ stripe_logical,
+ stripe_end);
if (ret)
goto out;
+ goto next;
+ }
- if (sctx->is_dev_replace)
- sync_replace_for_zoned(sctx);
-
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /*
- * loop until we find next data stripe
- * or we have finished all stripes.
- */
-loop:
- physical += map->stripe_len;
- ret = get_raid56_logic_offset(physical,
- stripe_index, map,
- &logical, &stripe_logical);
- logical += chunk_logical;
-
- if (ret && physical < physical_end) {
- stripe_logical += chunk_logical;
- stripe_end = stripe_logical +
- increment;
- ret = scrub_raid56_parity(sctx,
- map, scrub_dev,
- stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto loop;
- }
- } else {
- physical += map->stripe_len;
- logical += increment;
- }
- if (logical < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
-
- if (physical >= physical_end) {
- stop_loop = 1;
- break;
- }
- }
+ /*
+ * Now we're at a data stripe, scrub each extent in the range.
+ *
+ * At this stage, if we ignore the repair part, inside each data
+ * stripe it is no different from the SINGLE profile.
+ * We can reuse scrub_simple_mirror() here, as the repair part
+ * is still based on @mirror_num.
+ */
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ logical, map->stripe_len,
+ scrub_dev, physical, 1);
+ if (ret < 0)
+ goto out;
next:
- path->slots[0]++;
- }
- btrfs_release_path(path);
-skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
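To make the RAID56 walk above concrete, a worked example with hypothetical geometry: RAID5 over three devices with map->stripe_len of 64K gives nr_data_stripes() == 2 and increment == 128K, so each physical stripe we advance on this device accounts for 128K of logical address space, because one stripe out of every three in a full-stripe row holds parity. For each physical stripe, get_raid56_logic_offset() reports either a data stripe (scrubbed through scrub_simple_mirror() over a 64K logical range) or a parity stripe (handed to scrub_raid56_parity() as a full-stripe range).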
@@ -3964,9 +4022,9 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (!btrfs_check_super_location(scrub_dev, bytenr))
continue;
- ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, bytenr);
+ ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, bytenr);
if (ret)
return ret;
}
@@ -3979,22 +4037,23 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
&fs_info->scrub_lock)) {
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
-
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
+ struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
+ struct workqueue_struct *scrub_wr_comp =
+ fs_info->scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity =
+ fs_info->scrub_parity_workers;
fs_info->scrub_workers = NULL;
fs_info->scrub_wr_completion_workers = NULL;
fs_info->scrub_parity_workers = NULL;
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ if (scrub_workers)
+ destroy_workqueue(scrub_workers);
+ if (scrub_wr_comp)
+ destroy_workqueue(scrub_wr_comp);
+ if (scrub_parity)
+ destroy_workqueue(scrub_parity);
}
}
@@ -4004,9 +4063,9 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
+ struct workqueue_struct *scrub_workers = NULL;
+ struct workqueue_struct *scrub_wr_comp = NULL;
+ struct workqueue_struct *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
int ret = -ENOMEM;
@@ -4014,18 +4073,16 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
return 0;
- scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
- is_dev_replace ? 1 : max_active, 4);
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags,
+ is_dev_replace ? 1 : max_active);
if (!scrub_workers)
goto fail_scrub_workers;
- scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
+ scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
if (!scrub_wr_comp)
goto fail_scrub_wr_completion_workers;
- scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
- max_active, 2);
+ scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
if (!scrub_parity)
goto fail_scrub_parity_workers;
@@ -4046,11 +4103,11 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->scrub_lock);
ret = 0;
- btrfs_destroy_workqueue(scrub_parity);
+ destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(scrub_wr_comp);
+ destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(scrub_workers);
+ destroy_workqueue(scrub_workers);
fail_scrub_workers:
return ret;
}
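For readers less familiar with the plain workqueue API that replaces btrfs_workqueue here, a minimal sketch of the allocate/queue/destroy lifecycle using the same flags; every name in it is hypothetical:

	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct demo_work {
		struct work_struct work;
		int payload;
	};

	static void demo_fn(struct work_struct *work)
	{
		struct demo_work *dw = container_of(work, struct demo_work, work);

		pr_info("demo payload=%d\n", dw->payload);
		kfree(dw);
	}

	static int demo_run(void)
	{
		/* Same flags as the scrub queues above, with a max_active of 1 */
		struct workqueue_struct *wq =
			alloc_workqueue("demo", WQ_FREEZABLE | WQ_UNBOUND, 1);
		struct demo_work *dw;

		if (!wq)
			return -ENOMEM;
		dw = kzalloc(sizeof(*dw), GFP_KERNEL);
		if (dw) {
			INIT_WORK(&dw->work, demo_fn);
			dw->payload = 42;
			queue_work(wq, &dw->work);
		}
		flush_workqueue(wq);	/* wait for demo_fn() to complete */
		destroy_workqueue(wq);
		return 0;
	}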
@@ -4082,18 +4139,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
if (fs_info->nodesize >
- PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
- fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+ SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
+ fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
/*
- * would exhaust the array bounds of pagev member in
+ * Would exhaust the array bounds of sectorv member in
* struct scrub_block
*/
btrfs_err(fs_info,
- "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
- fs_info->nodesize,
- SCRUB_MAX_PAGES_PER_BLOCK,
- fs_info->sectorsize,
- SCRUB_MAX_PAGES_PER_BLOCK);
+"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
+ fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
+ fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
return -EINVAL;
}
@@ -4161,7 +4216,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
/*
* In order to avoid deadlock with reclaim when there is a transaction
* trying to pause scrub, make sure we use GFP_NOFS for all the
- * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+	 * allocations done at scrub_sectors() and scrub_sectors_for_parity()
* invoked by our callees. The pausing request is done when the
* transaction commit starts, and it blocks the transaction until scrub
* is paused (done at specific points at scrub_stripe() or right above
@@ -4295,11 +4350,11 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num)
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
{
u64 mapped_length;
struct btrfs_io_context *bioc = NULL;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7d1642937274..fa56890ff81f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -10,7 +10,6 @@
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
-#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
@@ -128,11 +127,18 @@ struct send_ctx {
struct list_head new_refs;
struct list_head deleted_refs;
- struct radix_tree_root name_cache;
+ struct xarray name_cache;
struct list_head name_cache_list;
int name_cache_size;
+ /*
+ * The inode we are currently processing. It's not NULL only when we
+ * need to issue write commands for data extents from this inode.
+ */
+ struct inode *cur_inode;
struct file_ra_state ra;
+ u64 page_cache_clear_start;
+ bool clean_page_cache;
/*
* We process inodes by their increasing order, so if before an
@@ -262,14 +268,13 @@ struct orphan_dir_info {
struct name_cache_entry {
struct list_head list;
/*
- * radix_tree has only 32bit entries but we need to handle 64bit inums.
- * We use the lower 32bit of the 64bit inum to store it in the tree. If
- * more then one inum would fall into the same entry, we use radix_list
- * to store the additional entries. radix_list is also used to store
- * entries where two entries have the same inum but different
- * generations.
+ * On 32bit kernels, xarray has only 32bit indices, but we need to
+ * handle 64bit inums. We use the lower 32bit of the 64bit inum to store
+ * it in the tree. If more than one inum would fall into the same entry,
+ * we use inum_aliases to store the additional entries. inum_aliases is
+ * also used to store entries with the same inum but different generations.
*/
- struct list_head radix_list;
+ struct list_head inum_aliases;
u64 ino;
u64 gen;
u64 parent_ino;
@@ -2019,9 +2024,9 @@ out:
}
/*
- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * Insert a name cache entry. On 32bit kernels the xarray index is 32bit,
* so we need to do some special handling in case we have clashes. This function
- * takes care of this with the help of name_cache_entry::radix_list.
+ * takes care of this with the help of name_cache_entry::inum_aliases.
* In case of error, nce is kfreed.
*/
static int name_cache_insert(struct send_ctx *sctx,
@@ -2030,8 +2035,7 @@ static int name_cache_insert(struct send_ctx *sctx,
int ret = 0;
struct list_head *nce_head;
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
+ nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
if (!nce_head) {
@@ -2040,14 +2044,14 @@ static int name_cache_insert(struct send_ctx *sctx,
}
INIT_LIST_HEAD(nce_head);
- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+ ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL);
if (ret < 0) {
kfree(nce_head);
kfree(nce);
return ret;
}
}
- list_add_tail(&nce->radix_list, nce_head);
+ list_add_tail(&nce->inum_aliases, nce_head);
list_add_tail(&nce->list, &sctx->name_cache_list);
sctx->name_cache_size++;
@@ -2059,15 +2063,14 @@ static void name_cache_delete(struct send_ctx *sctx,
{
struct list_head *nce_head;
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
+ nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
if (!nce_head) {
btrfs_err(sctx->send_root->fs_info,
"name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
nce->ino, sctx->name_cache_size);
}
- list_del(&nce->radix_list);
+ list_del(&nce->inum_aliases);
list_del(&nce->list);
sctx->name_cache_size--;
@@ -2075,7 +2078,7 @@ static void name_cache_delete(struct send_ctx *sctx,
* We may not get to the final release of nce_head if the lookup fails
*/
if (nce_head && list_empty(nce_head)) {
- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+ xa_erase(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
}
@@ -2086,11 +2089,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
struct list_head *nce_head;
struct name_cache_entry *cur;
- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+ nce_head = xa_load(&sctx->name_cache, (unsigned long)ino);
if (!nce_head)
return NULL;
- list_for_each_entry(cur, nce_head, radix_list) {
+ list_for_each_entry(cur, nce_head, inum_aliases) {
if (cur->ino == ino && cur->gen == gen)
return cur;
}
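The structure above — an xarray indexed by the low bits of a 64-bit key, where each slot holds a list head chaining all colliding entries — is a reusable pattern; a minimal sketch of the lookup side, with hypothetical types:

	#include <linux/xarray.h>
	#include <linux/list.h>

	struct alias_entry {
		struct list_head aliases;	/* chains entries whose indices collide */
		u64 full_key;			/* the real 64-bit key */
	};

	static struct alias_entry *alias_lookup(struct xarray *xa, u64 full_key)
	{
		/* The cast truncates on 32-bit; the list resolves collisions */
		struct list_head *head = xa_load(xa, (unsigned long)full_key);
		struct alias_entry *cur;

		if (!head)
			return NULL;
		list_for_each_entry(cur, head, aliases) {
			if (cur->full_key == full_key)
				return cur;
		}
		return NULL;
	}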
@@ -2675,61 +2678,43 @@ out:
static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
- struct extent_buffer *eb;
struct btrfs_dir_item *di;
- int slot;
path = alloc_path_for_send();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->send_root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
+ btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *eb = path->nodes[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
di_key.objectid < sctx->send_progress) {
ret = 1;
- goto out;
+ break;
}
-
- path->slots[0]++;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
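btrfs_for_each_slot() (defined in ctree.h) is what absorbs the removed search-slot/next-leaf boilerplate in this and the following hunks. A rough sketch of how such an iterator can be expressed; the helper below is illustrative, not the exact in-tree definition:

	/*
	 * Move to a valid slot, stepping to the next leaf when the current one
	 * is exhausted. Returns 0 with @found_key filled, >0 when there are no
	 * more items, <0 on error.
	 */
	static int next_valid_item(struct btrfs_root *root,
				   struct btrfs_key *found_key,
				   struct btrfs_path *path)
	{
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			int ret = btrfs_next_leaf(root, path);

			if (ret)
				return ret;
		}
		btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
		return 0;
	}

	#define for_each_slot(root, key, found_key, path, iter_ret)		      \
		for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
		     (iter_ret) >= 0 &&						      \
		     (iter_ret = next_valid_item((root), (found_key), (path))) == 0; \
		     (path)->slots[0]++)

On normal termination iter_ret is positive, which is why the callers only treat iter_ret < 0 as an error after the loop.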
@@ -2933,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
u64 send_progress)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root = sctx->parent_root;
struct btrfs_path *path;
struct btrfs_key key;
@@ -2959,23 +2945,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (odi)
key.offset = odi->last_dir_index_offset;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct waiting_dir_move *dm;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid ||
found_key.type != key.type)
break;
@@ -3010,8 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
ret = 0;
goto out;
}
-
- path->slots[0]++;
+ }
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
free_orphan_dir_info(sctx, odi);
@@ -3579,7 +3553,7 @@ static int check_ino_in_path(struct btrfs_root *root,
}
/*
- * Check if ino ino1 is an ancestor of inode ino2 in the given root for any
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
* possible path (in case ino2 is not a directory and has multiple hard links).
* Return 1 if true, 0 if false and < 0 on error.
*/
@@ -3591,6 +3565,7 @@ static int is_ancestor(struct btrfs_root *root,
{
bool free_fs_path = false;
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -3611,26 +3586,12 @@ static int is_ancestor(struct btrfs_root *root,
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (true) {
+ btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u32 cur_offset = 0;
u32 item_size;
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino2)
break;
if (key.type != BTRFS_INODE_REF_KEY &&
@@ -3668,10 +3629,12 @@ static int is_ancestor(struct btrfs_root *root,
if (ret)
goto out;
}
- path->slots[0]++;
}
ret = 0;
- out:
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+out:
btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
@@ -4551,13 +4514,12 @@ out:
static int process_all_refs(struct send_ctx *sctx,
enum btrfs_compare_tree_result cmd)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
iterate_inode_ref_t cb;
int pending_move = 0;
@@ -4581,24 +4543,7 @@ static int process_all_refs(struct send_ctx *sctx,
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
(found_key.type != BTRFS_INODE_REF_KEY &&
found_key.type != BTRFS_INODE_EXTREF_KEY))
@@ -4607,8 +4552,11 @@ static int process_all_refs(struct send_ctx *sctx,
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
if (ret < 0)
goto out;
-
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
btrfs_release_path(path);
@@ -4870,13 +4818,12 @@ out:
static int process_all_new_xattrs(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
path = alloc_path_for_send();
if (!path)
@@ -4887,39 +4834,21 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -4946,7 +4875,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
struct page *page;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
@@ -4957,40 +4885,33 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (ret)
return ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
- /* initial readahead */
- memset(&sctx->ra, 0, sizeof(struct file_ra_state));
- file_ra_state_init(&sctx->ra, inode->i_mapping);
-
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_lock_page(inode->i_mapping, index);
+ page = find_lock_page(sctx->cur_inode->i_mapping, index);
if (!page) {
- page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
- NULL, index, last_index + 1 - index);
+ page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, index,
+ last_index + 1 - index);
- page = find_or_create_page(inode->i_mapping, index,
- GFP_KERNEL);
+ page = find_or_create_page(sctx->cur_inode->i_mapping,
+ index, GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
break;
}
}
- if (PageReadahead(page)) {
- page_cache_async_readahead(inode->i_mapping, &sctx->ra,
- NULL, page, index, last_index + 1 - index);
- }
+ if (PageReadahead(page))
+ page_cache_async_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, page_folio(page),
+ index, last_index + 1 - index);
if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
@@ -5013,7 +4934,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
len -= cur_len;
sctx->send_size += cur_len;
}
- iput(inode);
+
return ret;
}
@@ -5220,12 +5141,49 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
+ const u64 end = offset + len;
u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
+ if (sctx->cur_inode == NULL) {
+ struct btrfs_root *root = sctx->send_root;
+
+ sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(sctx->cur_inode)) {
+ int err = PTR_ERR(sctx->cur_inode);
+
+ sctx->cur_inode = NULL;
+ return err;
+ }
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+ /*
+ * It's very likely there are no pages from this inode in the page
+ * cache, so after reading extents and sending their data, we clean
+ * the page cache to avoid trashing the page cache (adding pressure
+ * to the page cache and forcing eviction of other data more useful
+ * for applications).
+ *
+ * We decide if we should clean the page cache simply by checking
+ * if the inode's mapping nrpages is 0 when we first open it, and
+ * not by using something like filemap_range_has_page() before
+ * reading an extent because when we ask the readahead code to
+ * read a given file range, it may (and almost always does) read
+ * pages from beyond that range (see the documentation for
+	 * page_cache_sync_readahead()), so it would not be reliable:
+	 * after reading the first extent, future calls to
+	 * filemap_range_has_page() would return true because the readahead
+ * on the previous extent resulted in reading pages of the current
+ * extent as well.
+ */
+ sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
+ sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
+ }
+
while (sent < len) {
u64 size = min(len - sent, read_size);
int ret;
@@ -5235,6 +5193,37 @@ static int send_extent_data(struct send_ctx *sctx,
return ret;
sent += size;
}
+
+ if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ /*
+ * Always operate only on ranges that are a multiple of the page
+ * size. This is not only to prevent zeroing parts of a page in
+ * the case of subpage sector size, but also to guarantee we evict
+ * pages, as passing a range that is smaller than page size does
+ * not evict the respective page (only zeroes part of its content).
+ *
+ * Always start from the end offset of the last range cleared.
+ * This is because the readahead code may (and very often does)
+	 * read pages beyond the range we request for readahead. So if
+ * we have an extent layout like this:
+ *
+ * [ extent A ] [ extent B ] [ extent C ]
+ *
+ * When we ask page_cache_sync_readahead() to read extent A, it
+ * may also trigger reads for pages of extent B. If we are doing
+ * an incremental send and extent B has not changed between the
+ * parent and send snapshots, some or all of its pages may end
+ * up being read and placed in the page cache. So when truncating
+ * the page cache we always start from the end offset of the
+ * previously processed extent up to the end of the current
+ * extent.
+ */
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ end - 1);
+ sctx->page_cache_clear_start = end;
+ }
+
return 0;
}
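A worked example of the alignment rules above, assuming 4K pages: if the first extent sent covers the file range [0, 1M), end is page aligned, so pages in [0, 1M) are truncated and page_cache_clear_start becomes 1M. If the next extent ends at 1M + 6K, end is not page aligned and truncation is deferred; once a later extent restores alignment (or the inode is closed, see close_current_inode() below), everything from 1M up to that boundary is dropped in a single call.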
@@ -5965,13 +5954,12 @@ out:
static int process_all_extents(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
root = sctx->send_root;
path = alloc_path_for_send();
@@ -5981,41 +5969,21 @@ static int process_all_extents(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = process_extent(sctx, path, &found_key);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -6205,8 +6173,11 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
{
LIST_HEAD(deleted_refs);
struct btrfs_path *path;
+ struct btrfs_root *root = sctx->parent_root;
struct btrfs_key key;
+ struct btrfs_key found_key;
struct parent_paths_ctx ctx;
+ int iter_ret = 0;
int ret;
path = alloc_path_for_send();
@@ -6216,39 +6187,26 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
key.objectid = sctx->cur_ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
ctx.refs = &deleted_refs;
ctx.sctx = sctx;
- while (true) {
- struct extent_buffer *eb = path->nodes[0];
- int slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->parent_root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.objectid != sctx->cur_ino)
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ if (found_key.objectid != key.objectid)
break;
- if (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)
+ if (found_key.type != key.type &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)
break;
- ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
+ ret = iterate_inode_ref(root, path, &found_key, 1,
record_parent_ref, &ctx);
if (ret < 0)
goto out;
-
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
while (!list_empty(&deleted_refs)) {
@@ -6270,6 +6228,30 @@ out:
return ret;
}
+static void close_current_inode(struct send_ctx *sctx)
+{
+ u64 i_size;
+
+ if (sctx->cur_inode == NULL)
+ return;
+
+ i_size = i_size_read(sctx->cur_inode);
+
+ /*
+ * If we are doing an incremental send, we may have extents between the
+ * last processed extent and the i_size that have not been processed
+ * because they haven't changed but we may have read some of their pages
+ * through readahead, see the comments at send_extent_data().
+ */
+ if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ round_up(i_size, PAGE_SIZE) - 1);
+
+ iput(sctx->cur_inode);
+ sctx->cur_inode = NULL;
+}
+
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -6280,6 +6262,8 @@ static int changed_inode(struct send_ctx *sctx,
u64 left_gen = 0;
u64 right_gen = 0;
+ close_current_inode(sctx);
+
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
@@ -7534,7 +7518,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
+ xa_init_flags(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
sctx->flags = arg->flags;
@@ -7766,6 +7750,8 @@ out:
name_cache_free(sctx);
+ close_current_inode(sctx);
+
kfree(sctx);
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index b87931a458eb..2dd8754cb990 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
found->full = 0;
}
+/*
+ * Block groups with more than this value (in percent) of unusable space
+ * will be scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
return ret;
@@ -519,7 +528,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
}
- trans = (struct btrfs_trans_handle *)current->journal_info;
+ trans = current->journal_info;
/*
* If we are doing more ordered than delalloc we need to just wait on
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index d841fed73492..c096695598c1 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include "volumes.h"
+
struct btrfs_space_info {
spinlock_t lock;
@@ -24,6 +26,12 @@ struct btrfs_space_info {
the space info if we had an ENOSPC in the
allocator. */
+ /*
+	 * Once a block group's unusable space exceeds this threshold (in
+	 * percent) we'll schedule it for reclaim.
+ */
+ int bg_reclaim_threshold;
+
int clamp; /* Used to scale our threshold for preemptive
flushing. The value is >> clamp, so turns
out to be a 2^clamp divisor. */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index ef7ae20d2b77..a105b291444f 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -63,6 +63,29 @@
* This means a slightly higher tree locking latency.
*/
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+{
+ if (fs_info->sectorsize >= PAGE_SIZE)
+ return false;
+
+ /*
+ * Only data pages (either through DIO or compression) can have no
+	 * mapping. And if page->mapping->host is a data inode, it's subpage,
+	 * as we have already ruled out the sectorsize >= PAGE_SIZE case.
+ */
+ if (!page->mapping || !page->mapping->host ||
+ is_data_inode(page->mapping->host))
+ return true;
+
+ /*
+	 * Now the only remaining case is metadata, which goes through the
+	 * subpage routine only if nodesize < PAGE_SIZE.
+ */
+ if (fs_info->nodesize < PAGE_SIZE)
+ return true;
+ return false;
+}
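Concretely: on a 64K page system with a 4K sector size filesystem, data pages always take the subpage path, while metadata takes it only when nodesize < 64K; with 4K pages and 4K sectors the first check makes the function return false for everything.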
+
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
{
unsigned int cur = 0;
@@ -107,7 +130,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(PageLocked(page));
/* Either not subpage, or the page already has private attached */
- if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
+ if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
@@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage *subpage;
/* Either not subpage, or already detached */
- if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
+ if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
return;
- subpage = (struct btrfs_subpage *)detach_page_private(page);
+ subpage = detach_page_private(page);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@@ -319,7 +342,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
lock_page(page);
return 0;
}
@@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
return unlock_page(page);
btrfs_subpage_clamp_range(page, &start, &len);
if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
@@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
} \
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
btrfs_subpage_clamp_range(page, &start, &len); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
@@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(!PageDirty(page));
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->private);
@@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
struct btrfs_subpage *subpage;
ASSERT(PageLocked(page));
- /* For regular page size case, we just unlock the page */
- if (fs_info->sectorsize == PAGE_SIZE)
+ /* For non-subpage case, we just unlock the page */
+ if (!btrfs_is_subpage(fs_info, page))
return unlock_page(page);
ASSERT(PagePrivate(page) && page->private);
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 7accb5c40d33..0e80ad336904 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -74,6 +74,8 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b228efe8ab6e..b1fdc6a26c76 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -261,7 +261,7 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@@ -292,10 +292,10 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
char statestr[STATE_STRING_BUF_LEN];
btrfs_state_to_string(fs_info, statestr);
- printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
fs_info->sb->s_id, statestr, &vaf);
} else {
- printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
}
}
@@ -1903,6 +1903,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
@@ -1912,8 +1913,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
- new_pool_size);
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 17389a42a3ab..92a1fa8e3da6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -394,11 +394,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
- /* 4K sector size is also supported with 64K page size */
- if (PAGE_SIZE == SZ_64K)
+ /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (PAGE_SIZE > SZ_4K)
ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
-
- /* Only sectorsize == PAGE_SIZE is now supported */
ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
@@ -722,6 +720,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ ssize_t ret;
+
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+
+ return ret;
+}
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh < 0 || thresh > 100)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
+ btrfs_sinfo_bg_reclaim_threshold_show,
+ btrfs_sinfo_bg_reclaim_threshold_store);
+
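Assuming the usual sysfs placement of space-info attributes, the new knob should surface as /sys/fs/btrfs/<fsid>/allocation/<type>/bg_reclaim_threshold (e.g. .../allocation/metadata/bg_reclaim_threshold); it reads back the current percentage and accepts writes in the range 0-100, rejecting anything else with -EINVAL.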
/*
* Allocation information about block group types.
*
@@ -738,6 +772,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
+ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
NULL,
};
ATTRIBUTE_GROUPS(space_info);
@@ -922,6 +957,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
case BTRFS_EXCLOP_BALANCE:
str = "balance\n";
break;
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ str = "balance paused\n";
+ break;
case BTRFS_EXCLOP_DEV_ADD:
str = "device add\n";
break;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d8e56edd6991..1591bfa55bcc 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
- struct radix_tree_iter iter;
- void **slot;
+ unsigned long index;
+ struct extent_buffer *eb;
struct btrfs_device *dev, *tmp;
if (!fs_info)
@@ -163,25 +163,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
- spin_lock(&fs_info->buffer_lock);
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
- struct extent_buffer *eb;
-
- eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
- if (!eb)
- continue;
- /* Shouldn't happen but that kind of thinking creates CVE's */
- if (radix_tree_exception(eb)) {
- if (radix_tree_deref_retry(eb))
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock(&fs_info->buffer_lock);
+ xa_for_each(&fs_info->extent_buffers, index, eb) {
free_extent_buffer_stale(eb);
- spin_lock(&fs_info->buffer_lock);
}
- spin_unlock(&fs_info->buffer_lock);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
@@ -202,7 +186,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
if (!root)
return;
/* Will be freed by btrfs_free_fs_roots */
- if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
+ if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state)))
return;
btrfs_global_root_delete(root);
btrfs_put_root(root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b008c5110958..06c0a958d114 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,7 +23,7 @@
#include "space-info.h"
#include "zoned.h"
-#define BTRFS_ROOT_TRANS_TAG 0
+#define BTRFS_ROOT_TRANS_TAG XA_MARK_0
/*
* Transaction states and transitions
@@ -221,7 +221,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
 * the caching thread will re-start its search from 3, and thus find
* the hole from [4,6) to add to the free space cache.
*/
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
struct btrfs_block_group *cache = caching_ctl->block_group;
@@ -234,7 +234,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
cache->last_byte_to_unpin = caching_ctl->progress;
}
}
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
*/
smp_wmb();
- spin_lock(&fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_lock);
if (root->last_trans == trans->transid && !force) {
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
return 0;
}
- radix_tree_tag_set(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ xa_set_mark(&fs_info->fs_roots,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_lock);
root->last_trans = trans->transid;
/* this is pretty tricky. We don't want to
@@ -487,11 +487,9 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
spin_unlock(&cur_trans->dropped_roots_lock);
/* Make sure we don't try to update the root at commit time */
- spin_lock(&fs_info->fs_roots_radix_lock);
- radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ xa_clear_mark(&fs_info->fs_roots,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
}
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
@@ -1404,9 +1402,8 @@ void btrfs_add_dead_root(struct btrfs_root *root)
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *gang[8];
- int i;
- int ret;
+ struct btrfs_root *root;
+ unsigned long index;
/*
* At this point no one can be using this transaction to modify any tree
@@ -1414,57 +1411,46 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
*/
ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
- spin_lock(&fs_info->fs_roots_radix_lock);
- while (1) {
- ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
- (void **)gang, 0,
- ARRAY_SIZE(gang),
- BTRFS_ROOT_TRANS_TAG);
- if (ret == 0)
- break;
- for (i = 0; i < ret; i++) {
- struct btrfs_root *root = gang[i];
- int ret2;
-
- /*
- * At this point we can neither have tasks logging inodes
- * from a root nor trying to commit a log tree.
- */
- ASSERT(atomic_read(&root->log_writers) == 0);
- ASSERT(atomic_read(&root->log_commit[0]) == 0);
- ASSERT(atomic_read(&root->log_commit[1]) == 0);
-
- radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_radix_lock);
-
- btrfs_free_log(trans, root);
- ret2 = btrfs_update_reloc_root(trans, root);
- if (ret2)
- return ret2;
-
- /* see comments in should_cow_block() */
- clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_mb__after_atomic();
-
- if (root->commit_root != root->node) {
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
- btrfs_set_root_node(&root->root_item,
- root->node);
- }
+ spin_lock(&fs_info->fs_roots_lock);
+ xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) {
+ int ret;
+
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
+ xa_clear_mark(&fs_info->fs_roots,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_lock);
- ret2 = btrfs_update_root(trans, fs_info->tree_root,
- &root->root_key,
- &root->root_item);
- if (ret2)
- return ret2;
- spin_lock(&fs_info->fs_roots_radix_lock);
- btrfs_qgroup_free_meta_all_pertrans(root);
+ btrfs_free_log(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ if (ret)
+ return ret;
+
+ /* See comments in should_cow_block() */
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
+
+ if (root->commit_root != root->node) {
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_set_root_node(&root->root_item, root->node);
}
+
+ ret = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ if (ret)
+ return ret;
+ spin_lock(&fs_info->fs_roots_lock);
+ btrfs_qgroup_free_meta_all_pertrans(root);
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
return 0;
}
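The loop above drops fs_roots_lock around the sleeping work and retakes it before the next iteration; this is safe because xa_for_each_marked() re-walks from the saved index on each pass instead of keeping iterator state that the lock would have to protect. A minimal sketch of the pattern, with hypothetical names:

	static void process_marked_entries(struct xarray *xa, spinlock_t *lock,
					   void (*work_fn)(void *entry))
	{
		unsigned long index;
		void *entry;

		spin_lock(lock);
		xa_for_each_marked(xa, index, entry, XA_MARK_0) {
			xa_clear_mark(xa, index, XA_MARK_0);
			spin_unlock(lock);
			work_fn(entry);		/* may sleep */
			spin_lock(lock);
		}
		spin_unlock(lock);
	}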
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index e56c0107eea3..9e0e0ae2288c 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1855,3 +1855,58 @@ out:
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
+
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
+{
+ const bool is_subvol = is_fstree(root_owner);
+ const u64 eb_owner = btrfs_header_owner(eb);
+
+ /*
+ * Skip dummy fs, as selftests don't create unique ebs for each dummy
+ * root.
+ */
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ return 0;
+ /*
+ * There are several call sites (backref walking, qgroup, and data
+ * reloc) passing 0 as @root_owner, as they are not holding the
+ * tree root. In that case, we can not do a reliable ownership check,
+ * so just exit.
+ */
+ if (root_owner == 0)
+ return 0;
+ /*
+	 * These trees use key.offset as their owner; our callers don't have
+ * the extra capacity to pass key.offset here. So we just skip them.
+ */
+ if (root_owner == BTRFS_TREE_LOG_OBJECTID ||
+ root_owner == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!is_subvol) {
+ /* For non-subvolume trees, the eb owner should match root owner */
+ if (unlikely(root_owner != eb_owner)) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ root_owner);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /*
+ * For subvolume trees, owners can mismatch, but they should all belong
+ * to subvolume trees.
+ */
+ if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+ return 0;
+}
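A sketch of how a read path might consume this check; the wrapper is illustrative, not a specific in-tree call site:

	static struct extent_buffer *read_checked_block(struct btrfs_root *root,
							u64 bytenr, u64 gen,
							int level)
	{
		struct extent_buffer *eb;

		eb = read_tree_block(root->fs_info, bytenr,
				     root->root_key.objectid, gen, level, NULL);
		if (IS_ERR(eb))
			return eb;
		/* An owner mismatch means the block belongs to the wrong tree */
		if (btrfs_check_eb_owner(eb, root->root_key.objectid)) {
			free_extent_buffer(eb);
			return ERR_PTR(-EUCLEAN);
		}
		return eb;
	}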
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 32fecc9dc1dd..ece497e26558 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 09e4f1a04e6f..370388fadf96 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -333,7 +333,7 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
@@ -894,8 +894,7 @@ update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
- if (inode)
- iput(inode);
+ iput(inode);
return ret;
}
@@ -2575,7 +2574,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@@ -2786,7 +2785,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen,
+ ret = btrfs_read_extent_buffer(next, ptr_gen,
*level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
@@ -2815,7 +2814,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -3721,11 +3720,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
key.offset = first_offset;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- if (ret)
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes and that result in pushing off some dir items
+ * from one leaf to another in order to accommodate for the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
@@ -3849,13 +3866,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
ret = insert_dir_log_key(trans, log, dst_path,
ino, *last_old_dentry_offset + 1,
key.offset - 1);
- /*
- * -EEXIST should never happen because when we
- * log a directory in full mode (LOG_INODE_ALL)
- * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from
- * the log tree.
- */
- ASSERT(ret != -EEXIST);
if (ret < 0)
return ret;
}
@@ -5805,6 +5815,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+	 * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if
+	 * one attempts to create an empty symlink).
+	 * We don't need to worry about flushing delalloc, because we create the
+	 * inline extent at the time the symlink is created (we never have
+	 * delalloc for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* Before logging the inode item, cache the value returned by
* inode_logged(), because after that we have the need to figure out if
* the inode was previously logged in this transaction.
@@ -6182,7 +6204,7 @@ again:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
@@ -7019,12 +7041,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
/*
	 * Another concurrent task might be logging the old directory,
	 * as it can be triggered when logging another inode that had or
- * still has a dentry in the old directory. So take the old
- * directory's log_mutex to prevent getting an -EEXIST when
- * logging a key to record the deletion, or having that other
- * task logging the old directory get an -EEXIST if it attempts
- * to log the same key after we just did it. In both cases that
- * would result in falling back to a transaction commit.
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+	 * it could be missed by an in-progress directory logging.
*/
mutex_lock(&old_dir->log_mutex);
ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a8cc736731fd..9c20049d1fec 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -164,24 +164,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
*/
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
- return BTRFS_RAID_RAID1C3;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
- return BTRFS_RAID_RAID1C4;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+ const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ if (!profile)
+ return BTRFS_RAID_SINGLE;
+
+ return BTRFS_BG_FLAG_TO_INDEX(profile);
}
const char *btrfs_bg_type_to_raid_name(u64 flags)
@@ -405,7 +393,6 @@ void btrfs_free_device(struct btrfs_device *device)
WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
- bio_put(device->flush_bio);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -643,7 +630,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
device->bdev = bdev;
@@ -2706,7 +2693,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -4063,13 +4050,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
- if (fs_info->sectorsize < PAGE_SIZE &&
- bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sectorsize %u with page size %lu",
- fs_info->sectorsize, PAGE_SIZE);
- return false;
- }
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -6313,7 +6293,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
- u64 stripe_len;
+ u32 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
@@ -6324,19 +6304,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
- /* Stripe where this block falls in */
- stripe_nr = div64_u64(offset, stripe_len);
- /* Offset of stripe in the chunk */
- stripe_offset = stripe_nr * stripe_len;
- if (offset < stripe_offset) {
- btrfs_crit(fs_info,
-"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
- stripe_offset, offset, em->start, logical, stripe_len);
- return -EINVAL;
- }
+ /*
+	 * stripe_nr is the stripe in which this block falls;
+	 * stripe_offset is the offset of this block within its stripe.
+ */
+ stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
+ ASSERT(stripe_offset < U32_MAX);
- /* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
/* Only stripe based profiles needs to check against stripe length. */
@@ -6738,11 +6712,11 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
- bio_set_dev(bio, dev->bdev);
btrfs_bio_counter_inc_noblocked(fs_info);
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
}
static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
@@ -6824,10 +6798,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
continue;
}
- if (dev_nr < total_devs - 1)
- bio = btrfs_bio_clone(first_bio);
- else
+ if (dev_nr < total_devs - 1) {
+ bio = btrfs_bio_clone(dev->bdev, first_bio);
+ } else {
bio = first_bio;
+ bio_set_dev(bio, dev->bdev);
+ }
submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
@@ -6949,16 +6925,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (!dev)
return ERR_PTR(-ENOMEM);
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
@@ -7370,7 +7336,6 @@ static int read_one_dev(struct extent_buffer *leaf,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
@@ -7386,30 +7351,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
+
/*
- * This will create extent buffer of nodesize, superblock size is
- * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
- * overallocate but we can keep it as-is, only the first page is used.
+	 * We allocate a dummy extent buffer just to use the extent buffer
+	 * accessors. There will be unused space after BTRFS_SUPER_INFO_SIZE,
+	 * but that's fine, we will not go beyond the system chunk array
+	 * anyway.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
- root->root_key.objectid, 0);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ if (!sb)
+ return -ENOMEM;
set_extent_buffer_uptodate(sb);
- /*
- * The sb extent buffer is artificial and just used to read the system array.
- * set_extent_buffer_uptodate() call does not properly mark all it's
- * pages up-to-date when the page is larger: extent does not cover the
- * whole page and consequently check_page_uptodate does not find all
- * the page's extents up-to-date (the hole beyond sb),
- * write_extent_buffer then triggers a WARN_ON.
- *
- * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
- * but sb spans only this function. Add an explicit SetPageUptodate call
- * to silence the warning eg. on PowerPC 64.
- */
- if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
- SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@@ -7572,6 +7523,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
int ret;
int slot;
+ int iter_ret = 0;
u64 total_dev = 0;
u64 last_ra_node = 0;
@@ -7615,30 +7567,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
- struct extent_buffer *node;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *node = path->nodes[1];
leaf = path->nodes[0];
slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- break;
- }
- node = path->nodes[1];
+
if (node) {
if (last_ra_node != node->start) {
readahead_tree_node_children(node);
last_ra_node = node->start;
}
}
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
@@ -7663,7 +7603,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto error;
}
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto error;
}
/*
@@ -7671,12 +7615,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
@@ -8288,7 +8232,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
static int relocating_repair_kthread(void *data)
{
- struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+ struct btrfs_block_group *cache = data;
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 target;
int ret = 0;
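
The btrfs_get_io_geometry() hunk above folds a divide, a multiply and a runtime sanity check into a single div64_u64_rem() call. Below is a minimal userspace sketch of that arithmetic; the helper is re-implemented here for illustration, and only the 64/32-bit split mirrors the patch:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's div64_u64_rem(). */
static uint64_t div64_u64_rem(uint64_t dividend, uint64_t divisor,
			      uint64_t *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

int main(void)
{
	const uint32_t stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */
	uint64_t offset = (5ULL << 16) + 4096;	/* block offset in the chunk */
	uint64_t stripe_offset;
	uint64_t stripe_nr;

	/* One division yields both the stripe number and the offset in it. */
	stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);

	/* The remainder is < stripe_len, so it fits the new u32 fields. */
	assert(stripe_offset < UINT32_MAX);

	printf("stripe_nr=%" PRIu64 " stripe_offset=%" PRIu64 "\n",
	       stripe_nr, stripe_offset);
	return 0;
}

Because the remainder of a division by a u32 stripe length is always below 2^32, the ASSERT in the patch is a pure sanity check rather than a functional one.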
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f3e28f11cfb6..6721002000ee 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,17 +17,51 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
+/* Used by the sanity checks for btrfs_raid_types. */
+#define const_ffs(n) (__builtin_ctzll(n) + 1)
+
+/*
+ * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
+ * RAID0 always to be the lowest profile bit.
+ * Although it's part of on-disk format and should never change, do extra
+ * compile-time sanity checks.
+ */
+static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
+ const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
+static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
+ ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+
+/* ilog2() can handle both constants and variables */
+#define BTRFS_BG_FLAG_TO_INDEX(profile) \
+ ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
+
+enum btrfs_raid_types {
+	/* SINGLE is the special one as it doesn't have an on-disk bit. */
+ BTRFS_RAID_SINGLE = 0,
+
+ BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
+ BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
+ BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
+ BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
+ BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
+ BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
+ BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
+ BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
+
+ BTRFS_NR_RAID_TYPES
+};
+
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
u64 len;
/* offset of logical address in chunk */
u64 offset;
/* length of single IO stripe */
- u64 stripe_len;
+ u32 stripe_len;
+ /* offset of address in stripe */
+ u32 stripe_offset;
/* number of stripe where address falls */
u64 stripe_nr;
- /* offset of address in stripe */
- u64 stripe_offset;
/* offset of raid56 stripe into the chunk */
u64 raid56_stripe_offset;
};
@@ -121,8 +155,8 @@ struct btrfs_device {
/* bytes used on the current transaction */
u64 commit_bytes_used;
- /* for sending down flush barriers */
- struct bio *flush_bio;
+ /* Bio used for flushing device barriers */
+ struct bio flush_bio;
struct completion flush_wait;
/* per-device scrub information */
@@ -430,7 +464,7 @@ struct map_lookup {
u64 type;
int io_align;
int io_width;
- u64 stripe_len;
+ u32 stripe_len;
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
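
The new BTRFS_BG_FLAG_TO_INDEX() relies on RAID0 being the lowest profile bit, so a shift plus ilog2() turns a profile bit directly into an array index while leaving index 0 free for SINGLE. Here is a standalone sketch, assuming the upstream bit layout (RAID0 at bit 3, higher profiles in ascending bits) and a GCC-style count-leading-zeros builtin:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed on-disk profile bits; RAID0 must stay the lowest profile bit. */
#define BG_RAID0   (1ULL << 3)
#define BG_RAID1   (1ULL << 4)
#define BG_DUP     (1ULL << 5)
#define BG_RAID10  (1ULL << 6)

static int ilog2_u64(uint64_t v)
{
	return 63 - __builtin_clzll(v);	/* GCC/Clang builtin */
}

/* Mirror of BTRFS_BG_FLAG_TO_INDEX(): shift so RAID0 maps to index 1. */
static int bg_flag_to_index(uint64_t profile)
{
	return ilog2_u64(profile >> (ilog2_u64(BG_RAID0) - 1));
}

int main(void)
{
	/* Index 0 is reserved for SINGLE, which has no on-disk bit. */
	assert(bg_flag_to_index(BG_RAID0) == 1);
	assert(bg_flag_to_index(BG_RAID1) == 2);
	assert(bg_flag_to_index(BG_DUP) == 3);
	assert(bg_flag_to_index(BG_RAID10) == 4);
	printf("profile bit -> raid index mapping holds\n");
	return 0;
}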
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 99abf41b89b9..7421abcf325a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
out:
if (start_trans)
btrfs_end_transaction(trans);
@@ -271,10 +272,12 @@ out:
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
+ struct btrfs_key found_key;
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
+ int iter_ret = 0;
int ret = 0;
size_t total_size = 0, size_left = size;
@@ -293,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path->reada = READA_FORWARD;
/* search for our xattrs */
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf;
int slot;
struct btrfs_dir_item *di;
- struct btrfs_key found_key;
u32 item_size;
u32 cur;
leaf = path->nodes[0];
slot = path->slots[0];
- /* this is where we start walking through the path */
- if (slot >= btrfs_header_nritems(leaf)) {
- /*
- * if we've reached the last slot in this leaf we need
- * to go to the next leaf and reset everything
- */
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
if (found_key.type < BTRFS_XATTR_ITEM_KEY)
- goto next_item;
+ continue;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
item_size = btrfs_item_size(leaf, slot);
@@ -350,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto next;
if (!buffer || (name_len + 1) > size_left) {
- ret = -ERANGE;
- goto err;
+ iter_ret = -ERANGE;
+ break;
}
read_extent_buffer(leaf, buffer, name_ptr, name_len);
@@ -363,12 +345,13 @@ next:
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
-next_item:
- path->slots[0]++;
}
- ret = total_size;
-err:
+ if (iter_ret < 0)
+ ret = iter_ret;
+ else
+ ret = total_size;
+
btrfs_free_path(path);
return ret;
@@ -403,10 +386,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct btrfs_root *root = BTRFS_I(inode)->root;
name = xattr_full_name(handler, name);
- ret = btrfs_validate_prop(name, value, size);
+ ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size);
if (ret)
return ret;
+ if (btrfs_ignore_prop(BTRFS_I(inode), name))
+ return 0;
+
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -416,7 +402,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
btrfs_end_transaction(trans);
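
Both conversions above swap an open-coded search/next-leaf loop for the btrfs_for_each_slot() iterator, which hides cursor advancement and surfaces any failure through iter_ret after the loop body. The toy model below shows the shape of that contract; it is an illustration, not the kernel macro:

#include <stdio.h>

struct toy_iter {
	int pos;
	int nr;
	int fail_at;	/* simulate an I/O error at this position */
};

static int toy_next(struct toy_iter *it, int *item)
{
	if (it->pos == it->fail_at)
		return -5;	/* -EIO stand-in */
	if (it->pos >= it->nr)
		return 1;	/* no more items */
	*item = it->pos++;
	return 0;
}

/* The macro owns the cursor advance; the body only consumes items. */
#define toy_for_each(it, item, ret)					\
	for (ret = toy_next((it), &(item)); ret == 0;			\
	     ret = toy_next((it), &(item)))

int main(void)
{
	struct toy_iter it = { .pos = 0, .nr = 8, .fail_at = 5 };
	int item, iter_ret = 0;

	toy_for_each(&it, item, iter_ret)
		printf("item %d\n", item);

	/* Catch an error found during iteration, as in the hunks above. */
	if (iter_ret < 0)
		fprintf(stderr, "iteration failed: %d\n", iter_ret);
	return iter_ret < 0 ? 1 : 0;
}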
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1b1b310c3c51..11237a913bee 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -51,11 +51,13 @@
#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
/*
- * Maximum supported zone size. Currently, SMR disks have a zone size of
- * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
- * expect the zone size to become larger than 8GiB in the near future.
+ * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+ * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+ * We do not expect the zone size to become larger than 8GiB or smaller than
+ * 4MiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define BTRFS_MIN_ZONE_SIZE SZ_4M
#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
@@ -350,7 +352,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
- struct request_queue *queue = bdev_get_queue(bdev);
unsigned int max_active_zones;
unsigned int nactive;
sector_t nr_sectors;
@@ -402,6 +403,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
+ } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu smaller than supported minimum %u",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
}
nr_sectors = bdev_nr_sectors(bdev);
@@ -410,7 +418,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = queue_max_active_zones(queue);
+ max_active_zones = bdev_max_active_zones(bdev);
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -1835,6 +1843,12 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
+ /* No space left */
+ if (btrfs_zoned_bg_is_full(block_group)) {
+ ret = false;
+ goto out_unlock;
+ }
+
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
@@ -1842,35 +1856,23 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
if (device->zone_info->max_active_zones == 0)
continue;
- /* No space left */
- if (block_group->alloc_offset == block_group->zone_capacity) {
- ret = false;
- goto out_unlock;
- }
-
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
goto out_unlock;
}
-
- /* Successfully activated all the zones */
- if (i == map->num_stripes - 1)
- block_group->zone_is_active = 1;
-
-
}
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
spin_unlock(&block_group->lock);
- if (block_group->zone_is_active) {
- /* For the active block group list */
- btrfs_get_block_group(block_group);
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
- spin_lock(&fs_info->zone_active_bgs_lock);
- list_add_tail(&block_group->active_bg_list,
- &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
- }
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return true;
@@ -1879,20 +1881,14 @@ out_unlock:
return ret;
}
-int btrfs_zone_finish(struct btrfs_block_group *block_group)
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct map_lookup *map;
- struct btrfs_device *device;
- u64 physical;
+ bool need_zone_finish;
int ret = 0;
int i;
- if (!btrfs_is_zoned(fs_info))
- return 0;
-
- map = block_group->physical_map;
-
spin_lock(&block_group->lock);
if (!block_group->zone_is_active) {
spin_unlock(&block_group->lock);
@@ -1902,40 +1898,56 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
/* Check if we have unwritten allocated space */
if ((block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
- block_group->alloc_offset > block_group->meta_write_pointer) {
+ block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
spin_unlock(&block_group->lock);
return -EAGAIN;
}
- spin_unlock(&block_group->lock);
-
- ret = btrfs_inc_block_group_ro(block_group, false);
- if (ret)
- return ret;
-
- /* Ensure all writes in this block group finish */
- btrfs_wait_block_group_reservations(block_group);
- /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
- btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
- block_group->length);
-
- spin_lock(&block_group->lock);
/*
- * Bail out if someone already deactivated the block group, or
- * allocated space is left in the block group.
+ * If we are sure that the block group is full (= no more room left for
+ * new allocation) and the IO for the last usable block is completed, we
+ * don't need to wait for the other IOs. This holds because we ensure
+	 * sequential IO submission using the ZONE_APPEND command for data
+ * and block_group->meta_write_pointer for metadata.
*/
- if (!block_group->zone_is_active) {
+ if (!fully_written) {
spin_unlock(&block_group->lock);
- btrfs_dec_block_group_ro(block_group);
- return 0;
- }
- if (block_group->reserved) {
- spin_unlock(&block_group->lock);
- btrfs_dec_block_group_ro(block_group);
- return -EAGAIN;
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
}
+ /*
+ * The block group is not fully allocated, so not fully written yet. We
+	 * need to send a ZONE_FINISH command to free up an active zone.
+ */
+ need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
+
block_group->zone_is_active = 0;
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
@@ -1943,24 +1955,29 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
+ map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- physical = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 physical = map->stripes[i].physical;
if (device->zone_info->max_active_zones == 0)
continue;
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ if (need_zone_finish) {
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
- if (ret)
- return ret;
+ if (ret)
+ return ret;
+ }
btrfs_dev_clear_active_zone(device, physical);
}
- btrfs_dec_block_group_ro(block_group);
+
+ if (!fully_written)
+ btrfs_dec_block_group_ro(block_group);
spin_lock(&fs_info->zone_active_bgs_lock);
ASSERT(!list_empty(&block_group->active_bg_list));
@@ -1973,6 +1990,14 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
return 0;
}
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return 0;
+
+ return do_zone_finish(block_group, false);
+}
+
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
struct btrfs_fs_info *fs_info = fs_devices->fs_info;
@@ -2004,9 +2029,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
- struct map_lookup *map;
- struct btrfs_device *device;
- u64 physical;
+ u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
return;
@@ -2014,42 +2037,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
block_group = btrfs_lookup_block_group(fs_info, logical);
ASSERT(block_group);
- if (logical + length < block_group->start + block_group->zone_capacity)
- goto out;
-
- spin_lock(&block_group->lock);
+ /* No MIXED_BG on zoned btrfs. */
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ min_alloc_bytes = fs_info->sectorsize;
+ else
+ min_alloc_bytes = fs_info->nodesize;
- if (!block_group->zone_is_active) {
- spin_unlock(&block_group->lock);
+ /* Bail out if we can allocate more data from this block group. */
+ if (logical + length + min_alloc_bytes <=
+ block_group->start + block_group->zone_capacity)
goto out;
- }
- block_group->zone_is_active = 0;
- /* We should have consumed all the free space */
- ASSERT(block_group->alloc_offset == block_group->zone_capacity);
- ASSERT(block_group->free_space_ctl->free_space == 0);
- btrfs_clear_treelog_bg(block_group);
- btrfs_clear_data_reloc_bg(block_group);
- spin_unlock(&block_group->lock);
+ do_zone_finish(block_group, true);
- map = block_group->physical_map;
- device = map->stripes[0].dev;
- physical = map->stripes[0].physical;
+out:
+ btrfs_put_block_group(block_group);
+}
- if (!device->zone_info->max_active_zones)
- goto out;
+static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+{
+ struct btrfs_block_group *bg =
+ container_of(work, struct btrfs_block_group, zone_finish_work);
- btrfs_dev_clear_active_zone(device, physical);
+ wait_on_extent_buffer_writeback(bg->last_eb);
+ free_extent_buffer(bg->last_eb);
+ btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ btrfs_put_block_group(bg);
+}
- spin_lock(&fs_info->zone_active_bgs_lock);
- ASSERT(!list_empty(&block_group->active_bg_list));
- list_del_init(&block_group->active_bg_list);
- spin_unlock(&fs_info->zone_active_bgs_lock);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb)
+{
+ if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+ return;
- btrfs_put_block_group(block_group);
+ if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+ btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+ bg->start);
+ return;
+ }
-out:
- btrfs_put_block_group(block_group);
+ /* For the work */
+ btrfs_get_block_group(bg);
+ atomic_inc(&eb->refs);
+ bg->last_eb = eb;
+ INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+ queue_work(system_unbound_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2079,3 +2112,30 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_devices->device_list_mutex);
}
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 used = 0;
+ u64 total = 0;
+ u64 factor;
+
+ ASSERT(btrfs_is_zoned(fs_info));
+
+ if (fs_info->bg_reclaim_threshold == 0)
+ return false;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ total += device->disk_total_bytes;
+ used += device->bytes_used;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ factor = div64_u64(used * 100, total);
+ return factor >= fs_info->bg_reclaim_threshold;
+}
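
The new btrfs_zoned_should_reclaim() boils down to integer percentage math over the per-device totals. A userspace sketch of that decision follows; the total == 0 guard is an addition for this illustration, not taken from the patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool should_reclaim(uint64_t used, uint64_t total,
			   uint64_t threshold_percent)
{
	/* Threshold 0 disables reclaim; guard division for the demo. */
	if (threshold_percent == 0 || total == 0)
		return false;
	return used * 100 / total >= threshold_percent;
}

int main(void)
{
	const uint64_t gib = 1ULL << 30;

	/* With the default 75% threshold, reclaim kicks in at 75 GiB. */
	printf("74/100 GiB used: %d\n", should_reclaim(74 * gib, 100 * gib, 75));
	printf("75/100 GiB used: %d\n", should_reclaim(75 * gib, 100 * gib, 75));
	return 0;
}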
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 6dee76248cb4..bb1a189e11f9 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -10,11 +10,7 @@
#include "block-group.h"
#include "btrfs_inode.h"
-/*
- * Block groups with more than this value (percents) of unusable space will be
- * scheduled for background reclaim.
- */
-#define BTRFS_DEFAULT_RECLAIM_THRESH 75
+#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
struct btrfs_zoned_device_info {
/*
@@ -76,8 +72,11 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -233,9 +232,17 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
u64 logical, u64 length) { }
+static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb) { }
+
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -370,4 +377,10 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}
+static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
+{
+ ASSERT(btrfs_is_zoned(bg->fs_info));
+ return (bg->alloc_offset == bg->zone_capacity);
+}
+
#endif
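
zoned.h keeps each new API in a matched pair: a real declaration under CONFIG_BLK_DEV_ZONED and a static inline stub otherwise, so call sites never need their own #ifdef. A minimal sketch of that pattern, with FEATURE_ZONED as a hypothetical stand-in for the config option:

#include <stdbool.h>
#include <stdio.h>

#ifdef FEATURE_ZONED
static inline bool feature_should_reclaim(void)
{
	/* The real policy would live here. */
	return true;
}
#else
static inline bool feature_should_reclaim(void)
{
	return false;	/* compiled-out: never reclaim */
}
#endif

int main(void)
{
	/* Callers compile unchanged whether the feature is built in or not. */
	printf("should reclaim: %d\n", feature_should_reclaim());
	return 0;
}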
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index fc42dd0badd7..0fe31a6f6e68 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
-/*
- * zstd_reclaim_timer_fn - reclaim timer
+
+/**
+ * zstd_reclaim_timer_fn - reclaim timer callback
+ *
+ * @timer: the reclaim timer
*
* This scans the lru_list and attempts to reclaim any workspace that hasn't
* been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ *
+ * The context is softirq and does not need the _bh locking primitives.
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- spin_lock_bh(&wsm.lock);
+ spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
return;
}
@@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
if (!list_empty(&wsm.lru_list))
mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
}
/*
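
Aside from the locking change, the reclaim timer's job is a plain age-based sweep of the LRU list. A toy model of that sweep, with arrays and wall-clock time standing in for the kernel's lists and jiffies:

#include <stdio.h>
#include <time.h>

#define RECLAIM_SECS 90	/* stand-in for ZSTD_BTRFS_RECLAIM_JIFFIES */

struct workspace {
	time_t last_used;
	int live;
};

/* Free every workspace that has been idle longer than the threshold. */
static void reclaim(struct workspace *ws, int nr, time_t now)
{
	for (int i = 0; i < nr; i++) {
		if (ws[i].live && now - ws[i].last_used > RECLAIM_SECS) {
			ws[i].live = 0;
			printf("reclaimed workspace %d\n", i);
		}
	}
}

int main(void)
{
	time_t now = time(NULL);
	struct workspace ws[] = {
		{ .last_used = now - 10, .live = 1 },	/* recently used */
		{ .last_used = now - 300, .live = 1 },	/* idle, reclaimable */
	};

	reclaim(ws, 2, now);
	return 0;
}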
diff --git a/fs/buffer.c b/fs/buffer.c
index 2b5561ae5d0b..898c7f301b1b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -79,26 +79,26 @@ void unlock_buffer(struct buffer_head *bh)
EXPORT_SYMBOL(unlock_buffer);
/*
- * Returns if the page has dirty or writeback buffers. If all the buffers
- * are unlocked and clean then the PageDirty information is stale. If
- * any of the pages are locked, it is assumed they are locked for IO.
+ * Returns whether the folio has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the folio_test_dirty information is stale. If
+ * any of the buffers are locked, it is assumed they are locked for IO.
*/
-void buffer_check_dirty_writeback(struct page *page,
+void buffer_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct buffer_head *head, *bh;
*dirty = false;
*writeback = false;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
return;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
*writeback = true;
- head = page_buffers(page);
bh = head;
do {
if (buffer_locked(bh))
@@ -314,7 +314,7 @@ static void decrypt_bh(struct work_struct *work)
}
/*
- * I/O completion handler for block_read_full_page() - pages
+ * I/O completion handler for block_read_full_folio() - pages
* which come unlocked at the end of I/O.
*/
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
@@ -955,7 +955,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
size);
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page_folio(page)))
goto failed;
}
@@ -1060,8 +1060,8 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* Also. When blockdev buffers are explicitly read with bread(), they
* individually become uptodate. But their backing page remains not
* uptodate - even if all of its buffers are uptodate. A subsequent
- * block_read_full_page() against that page will discover all the uptodate
- * buffers, will set the page uptodate and will perform no I/O.
+ * block_read_full_folio() against that folio will discover all the uptodate
+ * buffers, will set the folio uptodate and will perform no I/O.
*/
/**
@@ -2088,7 +2088,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
/*
* If this is a partial write which happened to make all buffers
- * uptodate then we can optimize away a bogus readpage() for
+ * uptodate then we can optimize away a bogus read_folio() for
* the next read(). Here we 'discover' whether the page went
* uptodate as a result of this (potentially partial) write.
*/
@@ -2104,13 +2104,13 @@ static int __block_commit_write(struct inode *inode, struct page *page,
* The filesystem needs to handle block truncation upon failure.
*/
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
- unsigned flags, struct page **pagep, get_block_t *get_block)
+ struct page **pagep, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -2137,12 +2137,12 @@ int block_write_end(struct file *file, struct address_space *mapping,
if (unlikely(copied < len)) {
/*
- * The buffers that were written will now be uptodate, so we
- * don't have to worry about a readpage reading them and
- * overwriting a partial write. However if we have encountered
- * a short write and only partially written into a buffer, it
- * will not be marked uptodate, so a readpage might come in and
- * destroy our partial write.
+ * The buffers that were written will now be uptodate, so
+ * we don't have to worry about a read_folio reading them
+ * and overwriting a partial write. However if we have
+ * encountered a short write and only partially written
+ * into a buffer, it will not be marked uptodate, so a
+ * read_folio might come in and destroy our partial write.
*
* Do the simplest thing, and just treat any short write to a
* non uptodate page as a zero-length write, and force the
@@ -2245,26 +2245,28 @@ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
EXPORT_SYMBOL(block_is_partially_uptodate);
/*
- * Generic "read page" function for block devices that have the normal
+ * Generic "read_folio" function for block devices that have the normal
* get_block functionality. This is most of the block device filesystems.
- * Reads the page asynchronously --- the unlock_buffer() and
+ * Reads the folio asynchronously --- the unlock_buffer() and
* set/clear_buffer_uptodate() functions propagate buffer state into the
- * page struct once IO has completed.
+ * folio once IO has completed.
*/
-int block_read_full_page(struct page *page, get_block_t *get_block)
+int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;
- head = create_page_buffers(page, inode, 0);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+
+ head = create_page_buffers(&folio->page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
- iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
+ iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
@@ -2282,10 +2284,11 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, iblock, bh, 0);
if (err)
- SetPageError(page);
+ folio_set_error(folio);
}
if (!buffer_mapped(bh)) {
- zero_user(page, i * blocksize, blocksize);
+ folio_zero_range(folio, i * blocksize,
+ blocksize);
if (!err)
set_buffer_uptodate(bh);
continue;
@@ -2301,16 +2304,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
} while (i++, iblock++, (bh = bh->b_this_page) != head);
if (fully_mapped)
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
if (!nr) {
/*
- * All buffers are uptodate - we can set the page uptodate
+ * All buffers are uptodate - we can set the folio uptodate
* as well. But not if get_block() returned an error.
*/
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
+ if (!folio_test_error(folio))
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
@@ -2335,7 +2338,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
}
return 0;
}
-EXPORT_SYMBOL(block_read_full_page);
+EXPORT_SYMBOL(block_read_full_folio);
/* utility function for filesystems that need to do work on expanding
* truncates. Uses filesystem pagecache writes to allow the filesystem to
@@ -2344,6 +2347,7 @@ EXPORT_SYMBOL(block_read_full_page);
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
struct page *page;
void *fsdata;
int err;
@@ -2352,11 +2356,11 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
if (err)
goto out;
- err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
+ err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
if (err)
goto out;
- err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
+ err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
BUG_ON(err > 0);
out:
@@ -2368,6 +2372,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
loff_t pos, loff_t *bytes)
{
struct inode *inode = mapping->host;
+ const struct address_space_operations *aops = mapping->a_ops;
unsigned int blocksize = i_blocksize(inode);
struct page *page;
void *fsdata;
@@ -2387,12 +2392,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ err = aops->write_begin(file, mapping, curpos, len,
&page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
- err = pagecache_write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(file, mapping, curpos, len, len,
page, fsdata);
if (err < 0)
goto out;
@@ -2420,12 +2425,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ err = aops->write_begin(file, mapping, curpos, len,
&page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
- err = pagecache_write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(file, mapping, curpos, len, len,
page, fsdata);
if (err < 0)
goto out;
@@ -2441,7 +2446,7 @@ out:
* We may have to extend the file.
*/
int cont_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata,
get_block_t *get_block, loff_t *bytes)
{
@@ -2460,7 +2465,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
(*bytes)++;
}
- return block_write_begin(mapping, pos, len, flags, pagep, get_block);
+ return block_write_begin(mapping, pos, len, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);
@@ -2568,8 +2573,7 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
* On exit the page is fully uptodate in the areas outside (from,to)
* The filesystem needs to handle block truncation upon failure.
*/
-int nobh_write_begin(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
struct page **pagep, void **fsdata,
get_block_t *get_block)
{
@@ -2591,7 +2595,7 @@ int nobh_write_begin(struct address_space *mapping,
from = pos & (PAGE_SIZE - 1);
to = from + len;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -2790,44 +2794,28 @@ int nobh_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize;
- sector_t iblock;
- unsigned length, pos;
struct inode *inode = mapping->host;
- struct page *page;
+ unsigned blocksize = i_blocksize(inode);
+ struct folio *folio;
struct buffer_head map_bh;
+ size_t offset;
+ sector_t iblock;
int err;
- blocksize = i_blocksize(inode);
- length = offset & (blocksize - 1);
-
/* Block boundary? Nothing to do */
- if (!length)
+ if (!(from & (blocksize - 1)))
return 0;
- length = blocksize - length;
- iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
-
- page = grab_cache_page(mapping, index);
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT,
+ mapping_gfp_mask(mapping));
err = -ENOMEM;
- if (!page)
+ if (!folio)
goto out;
- if (page_has_buffers(page)) {
-has_buffers:
- unlock_page(page);
- put_page(page);
- return block_truncate_page(mapping, from, get_block);
- }
-
- /* Find the buffer that contains "offset" */
- pos = blocksize;
- while (offset >= pos) {
- iblock++;
- pos += blocksize;
- }
+ if (folio_buffers(folio))
+ goto has_buffers;
+ iblock = from >> inode->i_blkbits;
map_bh.b_size = blocksize;
map_bh.b_state = 0;
err = get_block(inode, iblock, &map_bh, 0);
@@ -2838,29 +2826,35 @@ has_buffers:
goto unlock;
/* Ok, it's mapped. Make sure it's up-to-date */
- if (!PageUptodate(page)) {
- err = mapping->a_ops->readpage(NULL, page);
+ if (!folio_test_uptodate(folio)) {
+ err = mapping->a_ops->read_folio(NULL, folio);
if (err) {
- put_page(page);
+ folio_put(folio);
goto out;
}
- lock_page(page);
- if (!PageUptodate(page)) {
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
err = -EIO;
goto unlock;
}
- if (page_has_buffers(page))
+ if (folio_buffers(folio))
goto has_buffers;
}
- zero_user(page, offset, length);
- set_page_dirty(page);
+ offset = offset_in_folio(folio, from);
+ folio_zero_segment(folio, offset, round_up(offset, blocksize));
+ folio_mark_dirty(folio);
err = 0;
unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out:
return err;
+
+has_buffers:
+ folio_unlock(folio);
+ folio_put(folio);
+ return block_truncate_page(mapping, from, get_block);
}
EXPORT_SYMBOL(nobh_truncate_page);
@@ -3161,20 +3155,20 @@ int sync_dirty_buffer(struct buffer_head *bh)
EXPORT_SYMBOL(sync_dirty_buffer);
/*
- * try_to_free_buffers() checks if all the buffers on this particular page
+ * try_to_free_buffers() checks if all the buffers on this particular folio
* are unused, and releases them if so.
*
* Exclusion against try_to_free_buffers may be obtained by either
- * locking the page or by holding its mapping's private_lock.
+ * locking the folio or by holding its mapping's private_lock.
*
- * If the page is dirty but all the buffers are clean then we need to
- * be sure to mark the page clean as well. This is because the page
+ * If the folio is dirty but all the buffers are clean then we need to
+ * be sure to mark the folio clean as well. This is because the folio
* may be against a block device, and a later reattachment of buffers
- * to a dirty page will set *all* buffers dirty. Which would corrupt
+ * to a dirty folio will set *all* buffers dirty. Which would corrupt
* filesystem data on the same device.
*
- * The same applies to regular filesystem pages: if all the buffers are
- * clean then we set the page clean and proceed. To do that, we require
+ * The same applies to regular filesystem folios: if all the buffers are
+ * clean then we set the folio clean and proceed. To do that, we require
* total exclusion from block_dirty_folio(). That is obtained with
* private_lock.
*
@@ -3186,10 +3180,10 @@ static inline int buffer_busy(struct buffer_head *bh)
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
-static int
-drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
+static bool
+drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
{
- struct buffer_head *head = page_buffers(page);
+ struct buffer_head *head = folio_buffers(folio);
struct buffer_head *bh;
bh = head;
@@ -3207,46 +3201,46 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = next;
} while (bh != head);
*buffers_to_free = head;
- detach_page_private(page);
- return 1;
+ folio_detach_private(folio);
+ return true;
failed:
- return 0;
+ return false;
}
-int try_to_free_buffers(struct page *page)
+bool try_to_free_buffers(struct folio *folio)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = folio->mapping;
struct buffer_head *buffers_to_free = NULL;
- int ret = 0;
+	bool ret = false;
- BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
- return 0;
+ BUG_ON(!folio_test_locked(folio));
+ if (folio_test_writeback(folio))
+ return false;
if (mapping == NULL) { /* can this still happen? */
- ret = drop_buffers(page, &buffers_to_free);
+ ret = drop_buffers(folio, &buffers_to_free);
goto out;
}
spin_lock(&mapping->private_lock);
- ret = drop_buffers(page, &buffers_to_free);
+ ret = drop_buffers(folio, &buffers_to_free);
/*
* If the filesystem writes its buffers by hand (eg ext3)
- * then we can have clean buffers against a dirty page. We
- * clean the page here; otherwise the VM will never notice
+ * then we can have clean buffers against a dirty folio. We
+ * clean the folio here; otherwise the VM will never notice
* that the filesystem did any IO at all.
*
* Also, during truncate, discard_buffer will have marked all
- * the page's buffers clean. We discover that here and clean
- * the page also.
+ * the folio's buffers clean. We discover that here and clean
+ * the folio also.
*
* private_lock must be held over this entire operation in order
* to synchronise against block_dirty_folio and prevent the
* dirty bit from being lost.
*/
if (ret)
- cancel_dirty_page(page);
+ folio_cancel_dirty(folio);
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
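
The rewritten nobh_truncate_page() zeroes from the truncation point to the end of the containing block, computed via offset_in_folio() and round_up(). The sketch below reproduces just that arithmetic, assuming a 64KiB folio and redefining round_up() for userspace:

#include <stdint.h>
#include <stdio.h>

/* Generic userspace round_up(); the kernel macro requires a power of 2. */
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	const uint32_t blocksize = 1024;
	uint64_t from = 5300;	/* new EOF, landing inside a block */

	/* Block boundary? Nothing to zero. */
	if (!(from & (blocksize - 1)))
		return 0;

	uint64_t offset = from % (64 * 1024);	/* offset within the folio */
	uint64_t end = round_up(offset, blocksize);

	/* Zeroes bytes [5300, 6144): the tail of the block holding byte 5300. */
	printf("zero folio bytes [%llu, %llu)\n",
	       (unsigned long long)offset, (unsigned long long)end);
	return 0;
}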
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 719faeeda168..8df715640a48 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -26,3 +26,15 @@ config CACHEFILES_ERROR_INJECTION
help
This permits error injection to be enabled in cachefiles whilst a
cache is in service.
+
+config CACHEFILES_ONDEMAND
+ bool "Support for on-demand read"
+ depends on CACHEFILES
+ default n
+ help
+ This permits userspace to enable the cachefiles on-demand read mode.
+ In this mode, when a cache miss occurs, responsibility for fetching
+ the data lies with the cachefiles backend instead of with the netfs
+ and is delegated to userspace.
+
+ If unsure, say N.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 16d811f1a2fa..c37a7a9af10b 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -16,5 +16,6 @@ cachefiles-y := \
xattr.o
cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o
+cachefiles-$(CONFIG_CACHEFILES_ONDEMAND) += ondemand.o
obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 7ac04ee2c0a0..aa4efcabb5e3 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -75,6 +75,9 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
{ "inuse", cachefiles_daemon_inuse },
{ "secctx", cachefiles_daemon_secctx },
{ "tag", cachefiles_daemon_tag },
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+ { "copen", cachefiles_ondemand_copen },
+#endif
{ "", NULL }
};
@@ -108,6 +111,9 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
INIT_LIST_HEAD(&cache->volumes);
INIT_LIST_HEAD(&cache->object_list);
spin_lock_init(&cache->object_list_lock);
+ refcount_set(&cache->unbind_pincount, 1);
+ xa_init_flags(&cache->reqs, XA_FLAGS_ALLOC);
+ xa_init_flags(&cache->ondemand_ids, XA_FLAGS_ALLOC1);
/* set default caching limits
* - limit at 1% free space and/or free files
@@ -126,6 +132,53 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
return 0;
}
+static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
+{
+ struct xarray *xa = &cache->reqs;
+ struct cachefiles_req *req;
+ unsigned long index;
+
+ /*
+ * Make sure the following two operations won't be reordered.
+ * 1) set CACHEFILES_DEAD bit
+ * 2) flush requests in the xarray
+	 * Otherwise a request may be enqueued after the xarray has been
+	 * flushed, leaving an orphan request that is never completed.
+ *
+ * CPU 1 CPU 2
+ * ===== =====
+ * flush requests in the xarray
+ * test CACHEFILES_DEAD bit
+ * enqueue the request
+ * set CACHEFILES_DEAD bit
+ */
+ smp_mb();
+
+ xa_lock(xa);
+ xa_for_each(xa, index, req) {
+ req->error = -EIO;
+ complete(&req->done);
+ }
+ xa_unlock(xa);
+
+ xa_destroy(&cache->reqs);
+ xa_destroy(&cache->ondemand_ids);
+}
+
+void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache)
+{
+ if (refcount_dec_and_test(&cache->unbind_pincount)) {
+ cachefiles_daemon_unbind(cache);
+ cachefiles_open = 0;
+ kfree(cache);
+ }
+}
+
+void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache)
+{
+ refcount_inc(&cache->unbind_pincount);
+}
+
/*
* Release a cache.
*/
@@ -139,36 +192,27 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file)
set_bit(CACHEFILES_DEAD, &cache->flags);
- cachefiles_daemon_unbind(cache);
+ if (cachefiles_in_ondemand_mode(cache))
+ cachefiles_flush_reqs(cache);
/* clean up the control file interface */
cache->cachefilesd = NULL;
file->private_data = NULL;
- cachefiles_open = 0;
- kfree(cache);
+ cachefiles_put_unbind_pincount(cache);
_leave("");
return 0;
}
-/*
- * Read the cache state.
- */
-static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
- size_t buflen, loff_t *pos)
+static ssize_t cachefiles_do_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
{
- struct cachefiles_cache *cache = file->private_data;
unsigned long long b_released;
unsigned f_released;
char buffer[256];
int n;
- //_enter(",,%zu,", buflen);
-
- if (!test_bit(CACHEFILES_READY, &cache->flags))
- return 0;
-
/* check how much space the cache has */
cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check);
@@ -207,6 +251,25 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
}
/*
+ * Read the cache state.
+ */
+static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
+ size_t buflen, loff_t *pos)
+{
+ struct cachefiles_cache *cache = file->private_data;
+
+ //_enter(",,%zu,", buflen);
+
+ if (!test_bit(CACHEFILES_READY, &cache->flags))
+ return 0;
+
+ if (cachefiles_in_ondemand_mode(cache))
+ return cachefiles_ondemand_daemon_read(cache, _buffer, buflen);
+ else
+ return cachefiles_do_daemon_read(cache, _buffer, buflen);
+}
+
+/*
* Take a command from cachefilesd, parse it and act on it.
*/
static ssize_t cachefiles_daemon_write(struct file *file,
@@ -297,8 +360,13 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
poll_wait(file, &cache->daemon_pollwq, poll);
mask = 0;
- if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
- mask |= EPOLLIN;
+ if (cachefiles_in_ondemand_mode(cache)) {
+ if (!xa_empty(&cache->reqs))
+ mask |= EPOLLIN;
+ } else {
+ if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+ mask |= EPOLLIN;
+ }
if (test_bit(CACHEFILES_CULLING, &cache->flags))
mask |= EPOLLOUT;
@@ -687,11 +755,6 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
cache->brun_percent >= 100)
return -ERANGE;
- if (*args) {
- pr_err("'bind' command doesn't take an argument\n");
- return -EINVAL;
- }
-
if (!cache->rootdirname) {
pr_err("No cache directory specified\n");
return -EINVAL;
@@ -703,6 +766,18 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
return -EBUSY;
}
+ if (IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND)) {
+ if (!strcmp(args, "ondemand")) {
+ set_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags);
+ } else if (*args) {
+ pr_err("Invalid argument to the 'bind' command\n");
+ return -EINVAL;
+ }
+ } else if (*args) {
+ pr_err("'bind' command doesn't take an argument\n");
+ return -EINVAL;
+ }
+
/* Make sure we have copies of the tag string */
if (!cache->tag) {
/*
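
The smp_mb() comment in cachefiles_flush_reqs() describes a classic store-buffering race between flagging the cache dead and enqueuing requests: each side must publish its store before checking the other's, so at least one side observes the conflict. A C11 sketch of that general pattern, with full fences standing in for smp_mb(); the back-out on the enqueue side is illustrative, not the kernel's exact recovery:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool cache_dead;
static atomic_int queued;

static bool enqueue_request(void)
{
	atomic_store_explicit(&queued, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&cache_dead, memory_order_relaxed)) {
		/* Raced with shutdown: back out instead of orphaning. */
		atomic_store_explicit(&queued, 0, memory_order_relaxed);
		return false;
	}
	return true;
}

static void shut_down_and_flush(void)
{
	atomic_store_explicit(&cache_dead, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&queued, memory_order_relaxed))
		printf("flushing a pending request\n");
}

int main(void)
{
	if (enqueue_request())
		printf("request enqueued before shutdown\n");
	shut_down_and_flush();
	return 0;
}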
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index ae93cee9d25d..a69073a1d3f0 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -362,6 +362,8 @@ static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie)
spin_unlock(&cache->object_list_lock);
}
+ cachefiles_ondemand_clean_object(object);
+
if (object->file) {
cachefiles_begin_secure(cache, &saved_cred);
cachefiles_clean_up_object(object, cache);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index c793d33b0224..6cba2c6de2f9 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -15,6 +15,8 @@
#include <linux/fscache-cache.h>
#include <linux/cred.h>
#include <linux/security.h>
+#include <linux/xarray.h>
+#include <linux/cachefiles.h>
#define CACHEFILES_DIO_BLOCK_SIZE 4096
@@ -58,8 +60,13 @@ struct cachefiles_object {
enum cachefiles_content content_info:8; /* Info about content presence */
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+ int ondemand_id;
+#endif
};
+#define CACHEFILES_ONDEMAND_ID_CLOSED -1
+
/*
* Cache files cache definition
*/
@@ -98,11 +105,31 @@ struct cachefiles_cache {
#define CACHEFILES_DEAD 1 /* T if cache dead */
#define CACHEFILES_CULLING 2 /* T if cull engaged */
#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
+#define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */
char *rootdirname; /* name of cache root directory */
char *secctx; /* LSM security context */
char *tag; /* cache binding tag */
+ refcount_t unbind_pincount;/* refcount to do daemon unbind */
+ struct xarray reqs; /* xarray of pending on-demand requests */
+ struct xarray ondemand_ids; /* xarray for ondemand_id allocation */
+ u32 ondemand_id_next;
+};
+
+static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
+{
+ return IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) &&
+ test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags);
+}
+
+struct cachefiles_req {
+ struct cachefiles_object *object;
+ struct completion done;
+ int error;
+ struct cachefiles_msg msg;
};
+#define CACHEFILES_REQ_NEW XA_MARK_1
+
#include <trace/events/cachefiles.h>
static inline
@@ -145,6 +172,8 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
* daemon.c
*/
extern const struct file_operations cachefiles_daemon_fops;
+extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache);
+extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache);
/*
* error_inject.c
@@ -201,6 +230,16 @@ extern void cachefiles_put_object(struct cachefiles_object *object,
*/
extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
enum fscache_want_state want_state);
+extern int __cachefiles_prepare_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t *_start, size_t *_len,
+ bool no_space_allocated_yet);
+extern int __cachefiles_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv);
/*
* key.c
@@ -241,6 +280,45 @@ extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
struct cachefiles_object *object);
/*
+ * ondemand.c
+ */
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen);
+
+extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache,
+ char *args);
+
+extern int cachefiles_ondemand_init_object(struct cachefiles_object *object);
+extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object);
+
+extern int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len);
+
+#else
+static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+{
+ return 0;
+}
+
+static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+{
+}
+
+static inline int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/*
* security.c
*/
extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
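
The unbind pincount added to struct cachefiles_cache defers the actual unbind until the last user drops its pin: the daemon fd holds the initial reference, in-flight work takes extra pins, and teardown runs on the final put. A userspace sketch of that lifetime rule using C11 atomics (all names here are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cache {
	atomic_int unbind_pincount;
};

static void cache_get_pin(struct cache *c)
{
	atomic_fetch_add(&c->unbind_pincount, 1);
}

static void cache_put_pin(struct cache *c)
{
	/* fetch_sub returns the old value; 1 means this was the last pin. */
	if (atomic_fetch_sub(&c->unbind_pincount, 1) == 1) {
		printf("last pin dropped: unbinding and freeing cache\n");
		free(c);
	}
}

int main(void)
{
	struct cache *c = malloc(sizeof(*c));

	if (!c)
		return 1;
	atomic_init(&c->unbind_pincount, 1);	/* held by the open daemon fd */
	cache_get_pin(c);	/* an in-flight request pins the cache */
	cache_put_pin(c);	/* request completes */
	cache_put_pin(c);	/* daemon release: teardown happens here */
	return 0;
}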
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 9dc81e781f2b..000a28f46e59 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -277,36 +277,33 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
/*
* Initiate a write to the cache.
*/
-static int cachefiles_write(struct netfs_cache_resources *cres,
- loff_t start_pos,
- struct iov_iter *iter,
- netfs_io_terminated_t term_func,
- void *term_func_priv)
+int __cachefiles_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
{
- struct cachefiles_object *object;
struct cachefiles_cache *cache;
struct cachefiles_kiocb *ki;
struct inode *inode;
- struct file *file;
unsigned int old_nofs;
- ssize_t ret = -ENOBUFS;
+ ssize_t ret;
size_t len = iov_iter_count(iter);
- if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
- goto presubmission_error;
fscache_count_write();
- object = cachefiles_cres_object(cres);
cache = object->volume->cache;
- file = cachefiles_cres_file(cres);
_enter("%pD,%li,%llx,%zx/%llx",
file, file_inode(file)->i_ino, start_pos, len,
i_size_read(file_inode(file)));
- ret = -ENOMEM;
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
- if (!ki)
- goto presubmission_error;
+ if (!ki) {
+ if (term_func)
+ term_func(term_func_priv, -ENOMEM, false);
+ return -ENOMEM;
+ }
refcount_set(&ki->ki_refcnt, 2);
ki->iocb.ki_filp = file;
@@ -314,7 +311,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
ki->iocb.ki_ioprio = get_current_ioprio();
ki->object = object;
- ki->inval_counter = cres->inval_counter;
ki->start = start_pos;
ki->len = len;
ki->term_func = term_func;
@@ -369,11 +365,24 @@ in_progress:
cachefiles_put_kiocb(ki);
_leave(" = %zd", ret);
return ret;
+}
-presubmission_error:
- if (term_func)
- term_func(term_func_priv, ret, false);
- return ret;
+static int cachefiles_write(struct netfs_cache_resources *cres,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
+{
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
+ if (term_func)
+ term_func(term_func_priv, -ENOBUFS, false);
+ return -ENOBUFS;
+ }
+
+ return __cachefiles_write(cachefiles_cres_object(cres),
+ cachefiles_cres_file(cres),
+ start_pos, iter,
+ term_func, term_func_priv);
}
/*
@@ -394,6 +403,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
+ int rc;
_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
@@ -406,7 +416,8 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
why = cachefiles_trace_read_no_data;
- goto out_no_object;
+ if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags))
+ goto out_no_object;
}
/* The object and the file may be being created in the background. */
@@ -423,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
object = cachefiles_cres_object(cres);
cache = object->volume->cache;
cachefiles_begin_secure(cache, &saved_cred);
-
+retry:
off = cachefiles_inject_read_error();
if (off == 0)
off = vfs_llseek(file, subreq->start, SEEK_DATA);
@@ -474,6 +485,15 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
download_and_store:
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) {
+ rc = cachefiles_ondemand_read(object, subreq->start,
+ subreq->len);
+ if (!rc) {
+ __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags);
+ goto retry;
+ }
+ ret = NETFS_INVALID_READ;
+ }
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
@@ -484,13 +504,12 @@ out_no_object:
/*
* Prepare for a write to occur.
*/
-static int __cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet)
+int __cachefiles_prepare_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t *_start, size_t *_len,
+ bool no_space_allocated_yet)
{
- struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
- struct file *file = cachefiles_cres_file(cres);
loff_t start = *_start, pos;
size_t len = *_len, down;
int ret;
@@ -577,7 +596,8 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
}
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(cres, _start, _len, i_size,
+ ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
+ _start, _len,
no_space_allocated_yet);
cachefiles_end_secure(cache, saved_cred);
return ret;
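After this split, __cachefiles_write() reports every failure, including the kzalloc() failure, through term_func itself, so the old presubmission_error path disappears and callers get exactly one completion callback per submission. A minimal sketch of a synchronous caller, assuming the netfs_io_terminated_t signature from <linux/netfs.h>; my_write_done and the surrounding context are illustrative only:

    static void my_write_done(void *priv, ssize_t transferred_or_error,
                              bool was_async)
    {
            /* transferred_or_error: byte count on success, -errno on failure */
            complete((struct completion *)priv);
    }

    /* ... in some submission path ... */
    DECLARE_COMPLETION_ONSTACK(done);

    __cachefiles_write(object, file, pos, iter, my_write_done, &done);
    wait_for_completion(&done);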
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca9f3e4ec4b3..facf2ebe464b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -452,10 +452,9 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash];
struct file *file;
struct path path;
- uint64_t ni_size = object->cookie->object_size;
+ uint64_t ni_size;
long ret;
- ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
cachefiles_begin_secure(cache, &saved_cred);
@@ -481,6 +480,15 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
goto out_dput;
}
+ ret = cachefiles_ondemand_init_object(object);
+ if (ret < 0) {
+ file = ERR_PTR(ret);
+ goto out_unuse;
+ }
+
+ ni_size = object->cookie->object_size;
+ ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
+
if (ni_size > 0) {
trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size,
cachefiles_trunc_expand_tmpfile);
@@ -586,6 +594,10 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
}
_debug("file -> %pd positive", dentry);
+ ret = cachefiles_ondemand_init_object(object);
+ if (ret < 0)
+ goto error_fput;
+
ret = cachefiles_check_auxdata(object, file);
if (ret < 0)
goto check_failed;
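The reordering in cachefiles_create_tmpfile() above is deliberate: in on-demand mode, cookie->object_size only becomes valid once cachefiles_ondemand_init_object() has sent the OPEN request and the daemon has replied with "copen <id>,<size>" (see ondemand.c below, where the copen handler stores the size into the cookie). Reading and rounding the size before that call would truncate the tmpfile to a stale length. Schematically:

    ret = cachefiles_ondemand_init_object(object); /* may block until copen */
    if (ret < 0)
            goto out_unuse;

    /* only now is cookie->object_size trustworthy */
    ni_size = round_up(object->cookie->object_size, CACHEFILES_DIO_BLOCK_SIZE);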
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
new file mode 100644
index 000000000000..a41ae6efc545
--- /dev/null
+++ b/fs/cachefiles/ondemand.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/fdtable.h>
+#include <linux/anon_inodes.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static int cachefiles_ondemand_fd_release(struct inode *inode,
+ struct file *file)
+{
+ struct cachefiles_object *object = file->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ int object_id = object->ondemand_id;
+ struct cachefiles_req *req;
+ XA_STATE(xas, &cache->reqs, 0);
+
+ xa_lock(&cache->reqs);
+ object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
+
+ /*
+ * Flush all pending READ requests since their completion depends on
+ * anon_fd.
+ */
+ xas_for_each(&xas, req, ULONG_MAX) {
+ if (req->msg.opcode == CACHEFILES_OP_READ) {
+ req->error = -EIO;
+ complete(&req->done);
+ xas_store(&xas, NULL);
+ }
+ }
+ xa_unlock(&cache->reqs);
+
+ xa_erase(&cache->ondemand_ids, object_id);
+ trace_cachefiles_ondemand_fd_release(object, object_id);
+ cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
+ cachefiles_put_unbind_pincount(cache);
+ return 0;
+}
+
+static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
+ struct iov_iter *iter)
+{
+ struct cachefiles_object *object = kiocb->ki_filp->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct file *file = object->file;
+ size_t len = iter->count;
+ loff_t pos = kiocb->ki_pos;
+ const struct cred *saved_cred;
+ int ret;
+
+ if (!file)
+ return -ENOBUFS;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+ cachefiles_end_secure(cache, saved_cred);
+ if (ret < 0)
+ return ret;
+
+ trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
+ ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
+ if (!ret)
+ ret = len;
+
+ return ret;
+}
+
+static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
+ int whence)
+{
+ struct cachefiles_object *object = filp->private_data;
+ struct file *file = object->file;
+
+ if (!file)
+ return -ENOBUFS;
+
+ return vfs_llseek(file, pos, whence);
+}
+
+static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct cachefiles_object *object = filp->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct cachefiles_req *req;
+ unsigned long id;
+
+ if (ioctl != CACHEFILES_IOC_READ_COMPLETE)
+ return -EINVAL;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ id = arg;
+ req = xa_erase(&cache->reqs, id);
+ if (!req)
+ return -EINVAL;
+
+ trace_cachefiles_ondemand_cread(object, id);
+ complete(&req->done);
+ return 0;
+}
+
+static const struct file_operations cachefiles_ondemand_fd_fops = {
+ .owner = THIS_MODULE,
+ .release = cachefiles_ondemand_fd_release,
+ .write_iter = cachefiles_ondemand_fd_write_iter,
+ .llseek = cachefiles_ondemand_fd_llseek,
+ .unlocked_ioctl = cachefiles_ondemand_fd_ioctl,
+};
+
+/*
+ * OPEN request completion (copen)
+ * - command: "copen <id>,<cache_size>"
+ * <cache_size> indicates the object size if >= 0, or an error code if negative
+ */
+int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
+{
+ struct cachefiles_req *req;
+ struct fscache_cookie *cookie;
+ char *pid, *psize;
+ unsigned long id;
+ long size;
+ int ret;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ if (!*args) {
+ pr_err("Empty id specified\n");
+ return -EINVAL;
+ }
+
+ pid = args;
+ psize = strchr(args, ',');
+ if (!psize) {
+ pr_err("Cache size is not specified\n");
+ return -EINVAL;
+ }
+
+ *psize = 0;
+ psize++;
+
+ ret = kstrtoul(pid, 0, &id);
+ if (ret)
+ return ret;
+
+ req = xa_erase(&cache->reqs, id);
+ if (!req)
+ return -EINVAL;
+
+ /* fail OPEN request if copen format is invalid */
+ ret = kstrtol(psize, 0, &size);
+ if (ret) {
+ req->error = ret;
+ goto out;
+ }
+
+ /* fail OPEN request if daemon reports an error */
+ if (size < 0) {
+ if (!IS_ERR_VALUE(size))
+ size = -EINVAL;
+ req->error = size;
+ goto out;
+ }
+
+ cookie = req->object->cookie;
+ cookie->object_size = size;
+ if (size)
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ else
+ set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ trace_cachefiles_ondemand_copen(req->object, id, size);
+
+out:
+ complete(&req->done);
+ return ret;
+}
+
+static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct cachefiles_open *load;
+ struct file *file;
+ u32 object_id;
+ int ret, fd;
+
+ object = cachefiles_grab_object(req->object,
+ cachefiles_obj_get_ondemand_fd);
+ cache = object->volume->cache;
+
+ ret = xa_alloc_cyclic(&cache->ondemand_ids, &object_id, NULL,
+ XA_LIMIT(1, INT_MAX),
+ &cache->ondemand_id_next, GFP_KERNEL);
+ if (ret < 0)
+ goto err;
+
+ fd = get_unused_fd_flags(O_WRONLY);
+ if (fd < 0) {
+ ret = fd;
+ goto err_free_id;
+ }
+
+ file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops,
+ object, O_WRONLY);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto err_put_fd;
+ }
+
+ file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
+ fd_install(fd, file);
+
+ load = (void *)req->msg.data;
+ load->fd = fd;
+ req->msg.object_id = object_id;
+ object->ondemand_id = object_id;
+
+ cachefiles_get_unbind_pincount(cache);
+ trace_cachefiles_ondemand_open(object, &req->msg, load);
+ return 0;
+
+err_put_fd:
+ put_unused_fd(fd);
+err_free_id:
+ xa_erase(&cache->ondemand_ids, object_id);
+err:
+ cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
+ return ret;
+}
+
+ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
+{
+ struct cachefiles_req *req;
+ struct cachefiles_msg *msg;
+ unsigned long id = 0;
+ size_t n;
+ int ret = 0;
+ XA_STATE(xas, &cache->reqs, 0);
+
+ /*
+ * Search for a request that has not yet been processed, so that no
+ * request is handed out to the daemon twice.
+ */
+ xa_lock(&cache->reqs);
+ req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW);
+ if (!req) {
+ xa_unlock(&cache->reqs);
+ return 0;
+ }
+
+ msg = &req->msg;
+ n = msg->len;
+
+ if (n > buflen) {
+ xa_unlock(&cache->reqs);
+ return -EMSGSIZE;
+ }
+
+ xas_clear_mark(&xas, CACHEFILES_REQ_NEW);
+ xa_unlock(&cache->reqs);
+
+ id = xas.xa_index;
+ msg->msg_id = id;
+
+ if (msg->opcode == CACHEFILES_OP_OPEN) {
+ ret = cachefiles_ondemand_get_fd(req);
+ if (ret)
+ goto error;
+ }
+
+ if (copy_to_user(_buffer, msg, n) != 0) {
+ ret = -EFAULT;
+ goto err_put_fd;
+ }
+
+ /* CLOSE request has no reply */
+ if (msg->opcode == CACHEFILES_OP_CLOSE) {
+ xa_erase(&cache->reqs, id);
+ complete(&req->done);
+ }
+
+ return n;
+
+err_put_fd:
+ if (msg->opcode == CACHEFILES_OP_OPEN)
+ close_fd(((struct cachefiles_open *)msg->data)->fd);
+error:
+ xa_erase(&cache->reqs, id);
+ req->error = ret;
+ complete(&req->done);
+ return ret;
+}
+
+typedef int (*init_req_fn)(struct cachefiles_req *req, void *private);
+
+static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
+ enum cachefiles_opcode opcode,
+ size_t data_len,
+ init_req_fn init_req,
+ void *private)
+{
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct cachefiles_req *req;
+ XA_STATE(xas, &cache->reqs, 0);
+ int ret;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return 0;
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags))
+ return -EIO;
+
+ req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->object = object;
+ init_completion(&req->done);
+ req->msg.opcode = opcode;
+ req->msg.len = sizeof(struct cachefiles_msg) + data_len;
+
+ ret = init_req(req, private);
+ if (ret)
+ goto out;
+
+ do {
+ /*
+ * Stop enqueuing the request when the daemon is dying. The
+ * following two operations need to be atomic as a whole:
+ * 1) check the cache state, and
+ * 2) enqueue the request if the cache is alive.
+ * Otherwise the request may be enqueued after the xarray has been
+ * flushed, leaving an orphan request that will never be completed.
+ *
+ * CPU 1 CPU 2
+ * ===== =====
+ * test CACHEFILES_DEAD bit
+ * set CACHEFILES_DEAD bit
+ * flush requests in the xarray
+ * enqueue the request
+ */
+ xas_lock(&xas);
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* coupled with the barrier in cachefiles_flush_reqs() */
+ smp_mb();
+
+ if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) {
+ WARN_ON_ONCE(object->ondemand_id == 0);
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+ }
+
+ xas.xa_index = 0;
+ xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART)
+ xas_set_err(&xas, -EBUSY);
+ xas_store(&xas, req);
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ ret = xas_error(&xas);
+ if (ret)
+ goto out;
+
+ wake_up_all(&cache->daemon_pollwq);
+ wait_for_completion(&req->done);
+ ret = req->error;
+out:
+ kfree(req);
+ return ret;
+}
+
+static int cachefiles_ondemand_init_open_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_volume *volume = object->volume->vcookie;
+ struct cachefiles_open *load = (void *)req->msg.data;
+ size_t volume_key_size, cookie_key_size;
+ void *volume_key, *cookie_key;
+
+ /*
+ * The volume key is a NUL-terminated string: key[0] stores strlen() of
+ * the string and the characters follow, so the size copied below
+ * (strlen() + 1) includes the trailing '\0'.
+ */
+ volume_key_size = volume->key[0] + 1;
+ volume_key = volume->key + 1;
+
+ /* Cookie key is binary data, which is netfs specific. */
+ cookie_key_size = cookie->key_len;
+ cookie_key = fscache_get_key(cookie);
+
+ if (!(object->cookie->advice & FSCACHE_ADV_WANT_CACHE_SIZE)) {
+ pr_err("WANT_CACHE_SIZE is needed for on-demand mode\n");
+ return -EINVAL;
+ }
+
+ load->volume_key_size = volume_key_size;
+ load->cookie_key_size = cookie_key_size;
+ memcpy(load->data, volume_key, volume_key_size);
+ memcpy(load->data + volume_key_size, cookie_key, cookie_key_size);
+
+ return 0;
+}
+
+static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ int object_id = object->ondemand_id;
+
+ /*
+ * The object ID may still be 0 if the cookie lookup phase failed before
+ * an OPEN request was ever sent. Also avoid sending a CLOSE request for
+ * CACHEFILES_ONDEMAND_ID_CLOSED, which means the anon_fd has already
+ * been closed.
+ */
+ if (object_id <= 0)
+ return -ENOENT;
+
+ req->msg.object_id = object_id;
+ trace_cachefiles_ondemand_close(object, &req->msg);
+ return 0;
+}
+
+struct cachefiles_read_ctx {
+ loff_t off;
+ size_t len;
+};
+
+static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ struct cachefiles_read *load = (void *)req->msg.data;
+ struct cachefiles_read_ctx *read_ctx = private;
+ int object_id = object->ondemand_id;
+
+ /* Stop enqueuing requests once the daemon has closed the anon_fd. */
+ if (object_id <= 0) {
+ WARN_ON_ONCE(object_id == 0);
+ pr_info_once("READ: anonymous fd closed prematurely.\n");
+ return -EIO;
+ }
+
+ req->msg.object_id = object_id;
+ load->off = read_ctx->off;
+ load->len = read_ctx->len;
+ trace_cachefiles_ondemand_read(object, &req->msg, load);
+ return 0;
+}
+
+int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+{
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_volume *volume = object->volume->vcookie;
+ size_t volume_key_size, cookie_key_size, data_len;
+
+ /*
+ * CacheFiles first checks the cache file under the root cache
+ * directory. If the coherency check fails, it falls back to creating a
+ * new tmpfile as the cache file. Reuse the previously allocated object
+ * ID, if any.
+ */
+ if (object->ondemand_id > 0)
+ return 0;
+
+ volume_key_size = volume->key[0] + 1;
+ cookie_key_size = cookie->key_len;
+ data_len = sizeof(struct cachefiles_open) +
+ volume_key_size + cookie_key_size;
+
+ return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN,
+ data_len, cachefiles_ondemand_init_open_req, NULL);
+}
+
+void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+{
+ cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0,
+ cachefiles_ondemand_init_close_req, NULL);
+}
+
+int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len)
+{
+ struct cachefiles_read_ctx read_ctx = {pos, len};
+
+ return cachefiles_ondemand_send_req(object, CACHEFILES_OP_READ,
+ sizeof(struct cachefiles_read),
+ cachefiles_ondemand_init_read_req, &read_ctx);
+}
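For orientation, here is a heavily simplified sketch of the user-space side of this protocol: the daemon read()s one request at a time from the bound /dev/cachefiles fd, answers OPEN with a "copen <msg_id>,<size>" command, fills READ ranges through the anonymous fd, and acknowledges them with CACHEFILES_IOC_READ_COMPLETE. It assumes the uapi definitions this series adds in <linux/cachefiles.h>; fetch_object_size(), fill_range() and anon_fd_for() are hypothetical application helpers, and all error handling is omitted:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/cachefiles.h>

    static void handle_one_request(int devfd)
    {
            char buf[4096];
            struct cachefiles_msg *msg = (void *)buf;

            if (read(devfd, buf, sizeof(buf)) <= 0) /* one request per read() */
                    return;

            switch (msg->opcode) {
            case CACHEFILES_OP_OPEN: {
                    struct cachefiles_open *open = (void *)msg->data;
                    char reply[64];
                    /* hypothetical helper: look up the backing object size */
                    long long size = fetch_object_size(open);

                    /* complete the OPEN: "copen <msg_id>,<size or -errno>" */
                    snprintf(reply, sizeof(reply), "copen %u,%lld",
                             msg->msg_id, size);
                    write(devfd, reply, strlen(reply));
                    /* open->fd is the anon fd for filling this object later */
                    break;
            }
            case CACHEFILES_OP_READ: {
                    struct cachefiles_read *rd = (void *)msg->data;

                    /* hypothetical: fetch [off, off + len) and pwrite() it
                     * into the anon fd for this object */
                    fill_range(msg->object_id, rd->off, rd->len);
                    ioctl(anon_fd_for(msg->object_id),
                          CACHEFILES_IOC_READ_COMPLETE, msg->msg_id);
                    break;
            }
            case CACHEFILES_OP_CLOSE:
                    close(anon_fd_for(msg->object_id)); /* no reply expected */
                    break;
            }
    }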
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index aa25bffd4823..7584aa6e5025 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
if (folio_test_dirty(folio)) {
dout("%p dirty_folio %p idx %lu -- already dirty\n",
mapping->host, folio, folio->index);
- BUG_ON(!folio_get_private(folio));
+ VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
return false;
}
@@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
* Reference snap context in folio->private. Also set
* PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(folio_get_private(folio));
+ VM_BUG_ON_FOLIO(folio_test_private(folio), folio);
folio_attach_private(folio, snapc);
return ceph_fscache_dirty_folio(mapping, folio);
@@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
}
WARN_ON(!folio_test_locked(folio));
- if (folio_get_private(folio)) {
+ if (folio_test_private(folio)) {
dout("%p invalidate_folio idx %lu full dirty page\n",
inode, folio->index);
@@ -162,24 +162,24 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
folio_wait_fscache(folio);
}
-static int ceph_releasepage(struct page *page, gfp_t gfp)
+static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
- dout("%llx:%llx releasepage %p idx %lu (%sdirty)\n",
- ceph_vinop(inode), page,
- page->index, PageDirty(page) ? "" : "not ");
+ dout("%llx:%llx release_folio idx %lu (%sdirty)\n",
+ ceph_vinop(inode),
+ folio->index, folio_test_dirty(folio) ? "" : "not ");
- if (PagePrivate(page))
- return 0;
+ if (folio_test_private(folio))
+ return false;
- if (PageFsCache(page)) {
+ if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
- return 0;
- wait_on_page_fscache(page);
+ return false;
+ folio_wait_fscache(folio);
}
ceph_fscache_note_page_release(inode);
- return 1;
+ return true;
}
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
@@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
- if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+ pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
+ __func__, req->r_ops[i].op, req, i, req->r_tid);
break;
+ }
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
@@ -1311,14 +1314,14 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* clean, or already dirty within the same snap context.
*/
static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned aop_flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
struct folio *folio = NULL;
int r;
- r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL);
+ r = netfs_write_begin(file, inode->i_mapping, pos, len, &folio, NULL);
if (r == 0)
folio_wait_fscache(folio);
if (r < 0) {
@@ -1372,7 +1375,7 @@ out:
}
const struct address_space_operations ceph_aops = {
- .readpage = netfs_readpage,
+ .read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.writepage = ceph_writepage,
.writepages = ceph_writepages_start,
@@ -1380,7 +1383,7 @@ const struct address_space_operations ceph_aops = {
.write_end = ceph_write_end,
.dirty_folio = ceph_dirty_folio,
.invalidate_folio = ceph_invalidate_folio,
- .releasepage = ceph_releasepage,
+ .release_folio = ceph_release_folio,
.direct_IO = noop_direct_IO,
};
@@ -1772,7 +1775,7 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &ceph_vmops;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f1ad6884d4da..5c14ef04e474 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2274,6 +2274,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_dirops,
r_unsafe_dir_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -2294,6 +2296,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_iops,
r_unsafe_target_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -3870,6 +3874,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
inode, ci, mds, mseq, target);
retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
@@ -3933,6 +3938,7 @@ retry:
}
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
/* open target session */
@@ -3958,6 +3964,7 @@ retry:
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
if (tsession) {
mutex_unlock(&tsession->s_mutex);
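The three hunks above share one shape: mdsc->snap_rwsem, a sleeping rw_semaphore, is taken read-side before ci->i_ceph_lock, a spinlock, and dropped after it on both the normal and out_unlock paths, with the retry label placed before the down_read() so each pass reacquires both in the same order. The reason the export path needs snap_rwsem is not visible in this excerpt, but the ordering rule itself is general: a lock that can sleep must be taken outside any spinlocked region. As a sketch:

    down_read(&mdsc->snap_rwsem);   /* may sleep: take before the spinlock */
    spin_lock(&ci->i_ceph_lock);
    /* ... cap manipulation that must not sleep ... */
    spin_unlock(&ci->i_ceph_lock);
    up_read(&mdsc->snap_rwsem);     /* release in reverse order */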
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6c9e837aa1d3..8c8226c0feac 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
- iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
- iinfo.xattr_data = xattr_buf;
- memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
in.ino = cpu_to_le64(vino.ino);
in.snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
} else if (!d_in_lookup(dentry)) {
/* If it's not being looked up, it's negative */
return -ENOENT;
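The new guard works because a ceph pagelist keeps one page per allocated chunk on its head list, so a single-entry list means the security-context xattrs fit in one page, the most the async-create path can send inline. list_is_singular() is, in effect (this mirrors its definition in <linux/list.h>):

    static inline int list_is_singular(const struct list_head *head)
    {
            /* non-empty, and the first entry is also the last */
            return !list_empty(head) && (head->next == head->prev);
    }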
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa38c013126d..00c3de177dd6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4434,8 +4434,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- struct ceph_fs_client *fsc = s->s_mdsc->fsc;
-
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4444,10 +4442,6 @@ bool check_session_state(struct ceph_mds_session *s)
}
break;
case CEPH_MDS_SESSION_CLOSING:
- /* Should never reach this when not force unmounting */
- WARN_ON_ONCE(s->s_ttl &&
- READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
- fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
case CEPH_MDS_SESSION_CLOSED:
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d511a78383c3..580a847aa8b5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4612,8 +4612,9 @@ read_complete:
return rc;
}
-static int cifs_readpage(struct file *file, struct page *page)
+static int cifs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
loff_t offset = page_file_offset(page);
int rc = -EACCES;
unsigned int xid;
@@ -4626,7 +4627,7 @@ static int cifs_readpage(struct file *file, struct page *page)
return rc;
}
- cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n",
+ cifs_dbg(FYI, "read_folio %p at offset %d 0x%x\n",
page, (int)offset, (int)offset);
rc = cifs_readpage_worker(file, page, &offset);
@@ -4681,7 +4682,7 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
}
static int cifs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int oncethru = 0;
@@ -4695,7 +4696,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
start:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page) {
rc = -ENOMEM;
goto out;
@@ -4757,16 +4758,16 @@ out:
return rc;
}
-static int cifs_release_page(struct page *page, gfp_t gfp)
+static bool cifs_release_folio(struct folio *folio, gfp_t gfp)
{
- if (PagePrivate(page))
+ if (folio_test_private(folio))
return 0;
- if (PageFsCache(page)) {
+ if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
- fscache_note_page_release(cifs_inode_cookie(page->mapping->host));
+ fscache_note_page_release(cifs_inode_cookie(folio->mapping->host));
return true;
}
@@ -4905,6 +4906,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
cifs_dbg(FYI, "swap activate\n");
+ if (!swap_file->f_mapping->a_ops->swap_rw)
+ /* Cannot support swap */
+ return -EINVAL;
+
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
isize = inode->i_size;
@@ -4933,7 +4938,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
* from reading or writing the file
*/
- return 0;
+ sis->flags |= SWP_FS_OPS;
+ return add_swap_extent(sis, 0, sis->max, 0);
}
static void cifs_swap_deactivate(struct file *file)
@@ -4965,14 +4971,14 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
#endif
const struct address_space_operations cifs_addr_ops = {
- .readpage = cifs_readpage,
+ .read_folio = cifs_read_folio,
.readahead = cifs_readahead,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
.dirty_folio = cifs_dirty_folio,
- .releasepage = cifs_release_page,
+ .release_folio = cifs_release_folio,
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
@@ -4986,18 +4992,18 @@ const struct address_space_operations cifs_addr_ops = {
};
/*
- * cifs_readpages requires the server to support a buffer large enough to
+ * cifs_readahead requires the server to support a buffer large enough to
* contain the header plus one complete page of data. Otherwise, we need
- * to leave cifs_readpages out of the address space operations.
+ * to leave cifs_readahead out of the address space operations.
*/
const struct address_space_operations cifs_addr_ops_smallbuf = {
- .readpage = cifs_readpage,
+ .read_folio = cifs_read_folio,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
.dirty_folio = cifs_dirty_folio,
- .releasepage = cifs_release_page,
+ .release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
};
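The cifs_swap_activate() change above follows the pattern for filesystems that route swap I/O through ->swap_rw rather than mapping blocks directly: set SWP_FS_OPS and register a single extent spanning the whole swap file. A minimal sketch of that tail end, with hypothetical names and none of cifs's size and permission checks:

    static int myfs_swap_activate(struct swap_info_struct *sis,
                                  struct file *swap_file, sector_t *span)
    {
            *span = sis->pages;           /* whole file, one logical extent */
            sis->flags |= SWP_FS_OPS;     /* swap I/O goes via a_ops->swap_rw */
            return add_swap_extent(sis, 0, sis->max, 0);
    }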
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 31ef64eb7fbb..b3a1265711cc 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -649,7 +649,7 @@ static int smbd_ia_open(
smbd_max_frmr_depth,
info->id->device->attrs.max_fast_reg_page_list_len);
info->mr_type = IB_MR_TYPE_MEM_REG;
- if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
info->mr_type = IB_MR_TYPE_SG_GAPS;
info->pd = ib_alloc_pd(info->id->device, 0);
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 8907d0508198..8adf81042498 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -20,9 +20,10 @@
#include "coda_psdev.h"
#include "coda_linux.h"
-static int coda_symlink_filler(struct file *file, struct page *page)
+static int coda_symlink_filler(struct file *file, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
int error;
struct coda_inode_info *cii;
unsigned int len = PAGE_SIZE;
@@ -44,5 +45,5 @@ fail:
}
const struct address_space_operations coda_symlink_aops = {
- .readpage = coda_symlink_filler,
+ .read_folio = coda_symlink_filler,
};
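The coda, cifs and cramfs conversions in this series all use the same transitional shim: ->read_folio receives a folio, but since these filesystems only see order-0 folios here, the old page-based worker can be reused by unwrapping the folio at the top. The pattern, with hypothetical names:

    static int myfs_read_folio(struct file *file, struct folio *folio)
    {
            /* order-0 folio: its first (only) page is the whole folio */
            return myfs_readpage_worker(file, &folio->page);
    }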
diff --git a/fs/cramfs/README b/fs/cramfs/README
index d71b27e0ff15..778df5c4d70b 100644
--- a/fs/cramfs/README
+++ b/fs/cramfs/README
@@ -115,7 +115,7 @@ Block Size
(Block size in cramfs refers to the size of input data that is
compressed at a time. It's intended to be somewhere around
-PAGE_SIZE for cramfs_readpage's convenience.)
+PAGE_SIZE for cramfs_read_folio's convenience.)
The superblock ought to indicate the block size that the fs was
written for, since comments in <linux/pagemap.h> indicate that
@@ -161,7 +161,7 @@ size. The options are:
PAGE_SIZE.
It's easy enough to change the kernel to use a smaller value than
-PAGE_SIZE: just make cramfs_readpage read multiple blocks.
+PAGE_SIZE: just make cramfs_read_folio read multiple blocks.
The cost of option 1 is that kernels with a larger PAGE_SIZE
value don't get as good compression as they can.
@@ -173,9 +173,9 @@ they don't mind their cramfs being inaccessible to kernels with
smaller PAGE_SIZE values.
Option 3 is easy to implement if we don't mind being CPU-inefficient:
-e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which
+e.g. get read_folio to decompress to a buffer of size MAX_BLKSIZE (which
must be no larger than 32KB) and discard what it doesn't need.
-Getting readpage to read into all the covered pages is harder.
+Getting read_folio to read into all the covered pages is harder.
The main advantage of option 3 over 1, 2, is better compression. The
cost is greater complexity. Probably not worth it, but I hope someone
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 666aa380011e..7ae59a6afc5c 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -414,7 +414,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
/*
* Let's create a mixed map if we can't map it all.
* The normal paging machinery will take care of the
- * unpopulated ptes via cramfs_readpage().
+ * unpopulated ptes via cramfs_read_folio().
*/
int i;
vma->vm_flags |= VM_MIXEDMAP;
@@ -814,8 +814,9 @@ out:
return d_splice_alias(inode, dentry);
}
-static int cramfs_readpage(struct file *file, struct page *page)
+static int cramfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
u32 maxblock;
int bytes_filled;
@@ -925,7 +926,7 @@ err:
}
static const struct address_space_operations cramfs_aops = {
- .readpage = cramfs_readpage
+ .read_folio = cramfs_read_folio
};
/*
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 526a4c1bed99..e78be66bbf01 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -113,7 +113,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
if (WARN_ON_ONCE(len <= 0))
return -EINVAL;
- if (WARN_ON_ONCE(len % FS_CRYPTO_BLOCK_SIZE != 0))
+ if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
return -EINVAL;
fscrypt_generate_iv(&iv, lblk_num, ci);
@@ -213,8 +213,8 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place
* @inode: The inode to which this block belongs
* @page: The page containing the block to encrypt
- * @len: Size of block to encrypt. Doesn't need to be a multiple of the
- * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @len: Size of block to encrypt. This must be a multiple of
+ * FSCRYPT_CONTENTS_ALIGNMENT.
* @offs: Byte offset within @page at which the block to encrypt begins
* @lblk_num: Filesystem logical block number of the block, i.e. the 0-based
* number of the block within the file
@@ -283,8 +283,8 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
* fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place
* @inode: The inode to which this block belongs
* @page: The page containing the block to decrypt
- * @len: Size of block to decrypt. Doesn't need to be a multiple of the
- * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @len: Size of block to decrypt. This must be a multiple of
+ * FSCRYPT_CONTENTS_ALIGNMENT.
* @offs: Byte offset within @page at which the block to decrypt begins
* @lblk_num: Filesystem logical block number of the block, i.e. the 0-based
* number of the block within the file
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index a9be4bc74a94..14e0ef5e9a20 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -19,6 +19,13 @@
#include "fscrypt_private.h"
/*
+ * The minimum message length (input and output length), in bytes, for all
+ * filename encryption modes. Filenames shorter than this will be zero-padded
+ * before being encrypted.
+ */
+#define FSCRYPT_FNAME_MIN_MSG_LEN 16
+
+/*
* struct fscrypt_nokey_name - identifier for directory entry when key is absent
*
* When userspace lists an encrypted directory without access to the key, the
@@ -267,7 +274,7 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
if (orig_len > max_len)
return false;
- encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE);
+ encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN);
encrypted_len = round_up(encrypted_len, padding);
*encrypted_len_ret = min(encrypted_len, max_len);
return true;
@@ -350,7 +357,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
return 0;
}
- if (iname->len < FS_CRYPTO_BLOCK_SIZE)
+ if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN)
return -EUCLEAN;
if (fscrypt_has_encryption_key(inode))
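A quick worked example of the size computation in fscrypt_fname_encrypted_size() after this change, assuming a 3-byte filename and a policy padding of 32 (both values picked for illustration):

    u32 len = max_t(u32, 3, FSCRYPT_FNAME_MIN_MSG_LEN); /* 3 -> 16 */
    len = round_up(len, 32);                            /* 16 -> 32 */
    /* so the on-disk ciphertext name occupies 32 bytes */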
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 5b0a9e6478b5..6b4c8094cc7b 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -545,8 +545,8 @@ struct key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
-int fscrypt_add_test_dummy_key(struct super_block *sb,
- struct fscrypt_key_specifier *key_spec);
+int fscrypt_get_test_dummy_key_identifier(
+ u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
int fscrypt_verify_key_added(struct super_block *sb,
const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
@@ -561,7 +561,9 @@ struct fscrypt_mode {
int keysize; /* key size in bytes */
int security_strength; /* security strength in bytes */
int ivsize; /* IV size in bytes */
- int logged_impl_name;
+ int logged_cryptoapi_impl;
+ int logged_blk_crypto_native;
+ int logged_blk_crypto_fallback;
enum blk_crypto_mode_num blk_crypto_mode;
};
@@ -621,6 +623,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
const union fscrypt_policy *policy2);
+int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy,
+ struct fscrypt_key_specifier *key_spec);
bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
const struct inode *inode);
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 93c2ca858092..90f3e68f166e 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -12,7 +12,7 @@
* provides the key and IV to use.
*/
-#include <linux/blk-crypto.h>
+#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
@@ -64,6 +64,35 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
return DIV_ROUND_UP(lblk_bits, 8);
}
+/*
+ * Log a message when starting to use blk-crypto (native) or blk-crypto-fallback
+ * for an encryption mode for the first time. This is the blk-crypto
+ * counterpart to the message logged when starting to use the crypto API for the
+ * first time. A limitation is that these messages don't convey which specific
+ * filesystems or files are using each implementation. However, *usually*
+ * systems use just one implementation per mode, which makes these messages
+ * helpful for debugging problems where the "wrong" implementation is used.
+ */
+static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
+ struct request_queue **devs,
+ int num_devs,
+ const struct blk_crypto_config *cfg)
+{
+ int i;
+
+ for (i = 0; i < num_devs; i++) {
+ if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
+ __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) {
+ if (!xchg(&mode->logged_blk_crypto_native, 1))
+ pr_info("fscrypt: %s using blk-crypto (native)\n",
+ mode->friendly_name);
+ } else if (!xchg(&mode->logged_blk_crypto_fallback, 1)) {
+ pr_info("fscrypt: %s using blk-crypto-fallback\n",
+ mode->friendly_name);
+ }
+ }
+}
+
/* Enable inline encryption for this file if supported. */
int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
{
@@ -117,6 +146,8 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
goto out_free_devs;
}
+ fscrypt_log_blk_crypto_impl(ci->ci_mode, devs, num_devs, &crypto_cfg);
+
ci->ci_inlinecrypt = true;
out_free_devs:
kfree(devs);
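The logged_* flags added to struct fscrypt_mode rely on the standard log-once idiom: xchg() atomically stores 1 and returns the previous value, so exactly one caller, even under a race, sees 0 and prints. In isolation:

    static int logged;      /* zero-initialized, one flag per message */

    if (!xchg(&logged, 1))
            pr_info("printed exactly once, even if callers race\n");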
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 0b3ffbb4faf4..caee9f8620dd 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -688,28 +688,68 @@ out_wipe_secret:
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
-/*
- * Add the key for '-o test_dummy_encryption' to the filesystem keyring.
- *
- * Use a per-boot random key to prevent people from misusing this option.
- */
-int fscrypt_add_test_dummy_key(struct super_block *sb,
- struct fscrypt_key_specifier *key_spec)
+static void
+fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret)
{
static u8 test_key[FSCRYPT_MAX_KEY_SIZE];
+
+ get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);
+
+ memset(secret, 0, sizeof(*secret));
+ secret->size = FSCRYPT_MAX_KEY_SIZE;
+ memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE);
+}
+
+int fscrypt_get_test_dummy_key_identifier(
+ u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
struct fscrypt_master_key_secret secret;
int err;
- get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);
+ fscrypt_get_test_dummy_secret(&secret);
- memset(&secret, 0, sizeof(secret));
- secret.size = FSCRYPT_MAX_KEY_SIZE;
- memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE);
+ err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
+ if (err)
+ goto out;
+ err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER,
+ NULL, 0, key_identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+out:
+ wipe_master_key_secret(&secret);
+ return err;
+}
+
+/**
+ * fscrypt_add_test_dummy_key() - add the test dummy encryption key
+ * @sb: the filesystem instance to add the key to
+ * @dummy_policy: the encryption policy for test_dummy_encryption
+ *
+ * If needed, add the key for the test_dummy_encryption mount option to the
+ * filesystem. To prevent misuse of this mount option, a per-boot random key is
+ * used instead of a hardcoded one. As a result, any encrypted files created
+ * using this option won't be accessible after a reboot.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_add_test_dummy_key(struct super_block *sb,
+ const struct fscrypt_dummy_policy *dummy_policy)
+{
+ const union fscrypt_policy *policy = dummy_policy->policy;
+ struct fscrypt_key_specifier key_spec;
+ struct fscrypt_master_key_secret secret;
+ int err;
- err = add_master_key(sb, &secret, key_spec);
+ if (!policy)
+ return 0;
+ err = fscrypt_policy_to_key_spec(policy, &key_spec);
+ if (err)
+ return err;
+ fscrypt_get_test_dummy_secret(&secret);
+ err = add_master_key(sb, &secret, &key_spec);
wipe_master_key_secret(&secret);
return err;
}
+EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key);
/*
* Verify that the current user has added a master key with the given identifier
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index eede186b04ce..c35711896bd4 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -94,7 +94,7 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
mode->cipher_str, PTR_ERR(tfm));
return tfm;
}
- if (!xchg(&mode->logged_impl_name, 1)) {
+ if (!xchg(&mode->logged_cryptoapi_impl, 1)) {
/*
* fscrypt performance can vary greatly depending on which
* crypto algorithm implementation is used. Help people debug
@@ -425,23 +425,9 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
if (err)
return err;
- switch (ci->ci_policy.version) {
- case FSCRYPT_POLICY_V1:
- mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
- memcpy(mk_spec.u.descriptor,
- ci->ci_policy.v1.master_key_descriptor,
- FSCRYPT_KEY_DESCRIPTOR_SIZE);
- break;
- case FSCRYPT_POLICY_V2:
- mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
- memcpy(mk_spec.u.identifier,
- ci->ci_policy.v2.master_key_identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- break;
- default:
- WARN_ON(1);
- return -EINVAL;
- }
+ err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec);
+ if (err)
+ return err;
key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
if (IS_ERR(key)) {
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index ed3d623724cd..5f858cee1e3b 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,6 +10,7 @@
* Modified by Eric Biggers, 2019 for v2 policy support.
*/
+#include <linux/fs_context.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
@@ -32,6 +33,26 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy,
+ struct fscrypt_key_specifier *key_spec)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
+ memcpy(key_spec->u.descriptor, policy->v1.master_key_descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ return 0;
+ case FSCRYPT_POLICY_V2:
+ key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ memcpy(key_spec->u.identifier, policy->v2.master_key_identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+ return 0;
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+}
+
static const union fscrypt_policy *
fscrypt_get_dummy_policy(struct super_block *sb)
{
@@ -704,73 +725,45 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
EXPORT_SYMBOL_GPL(fscrypt_set_context);
/**
- * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption'
- * @sb: the filesystem on which test_dummy_encryption is being specified
- * @arg: the argument to the test_dummy_encryption option. May be NULL.
- * @dummy_policy: the filesystem's current dummy policy (input/output, see
- * below)
- *
- * Handle the test_dummy_encryption mount option by creating a dummy encryption
- * policy, saving it in @dummy_policy, and adding the corresponding dummy
- * encryption key to the filesystem. If the @dummy_policy is already set, then
- * instead validate that it matches @arg. Don't support changing it via
- * remount, as that is difficult to do safely.
+ * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option
+ * @param: the mount option
+ * @dummy_policy: (input/output) the place to write the dummy policy that will
+ * result from parsing the option. Zero-initialize this. If a policy is
+ * already set here (due to test_dummy_encryption being given multiple
+ * times), then this function will verify that the policies are the same.
*
- * Return: 0 on success (dummy policy set, or the same policy is already set);
- * -EEXIST if a different dummy policy is already set;
- * or another -errno value.
+ * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the
+ * argument conflicts with one already specified; or -ENOMEM.
*/
-int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
- struct fscrypt_dummy_policy *dummy_policy)
+int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct fscrypt_dummy_policy *dummy_policy)
{
- struct fscrypt_key_specifier key_spec = { 0 };
- int version;
- union fscrypt_policy *policy = NULL;
+ const char *arg = "v2";
+ union fscrypt_policy *policy;
int err;
- if (!arg)
- arg = "v2";
-
- if (!strcmp(arg, "v1")) {
- version = FSCRYPT_POLICY_V1;
- key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
- memset(key_spec.u.descriptor, 0x42,
- FSCRYPT_KEY_DESCRIPTOR_SIZE);
- } else if (!strcmp(arg, "v2")) {
- version = FSCRYPT_POLICY_V2;
- key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
- /* key_spec.u.identifier gets filled in when adding the key */
- } else {
- err = -EINVAL;
- goto out;
- }
+ if (param->type == fs_value_is_string && *param->string)
+ arg = param->string;
policy = kzalloc(sizeof(*policy), GFP_KERNEL);
- if (!policy) {
- err = -ENOMEM;
- goto out;
- }
-
- err = fscrypt_add_test_dummy_key(sb, &key_spec);
- if (err)
- goto out;
+ if (!policy)
+ return -ENOMEM;
- policy->version = version;
- switch (policy->version) {
- case FSCRYPT_POLICY_V1:
+ if (!strcmp(arg, "v1")) {
+ policy->version = FSCRYPT_POLICY_V1;
policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor,
+ memset(policy->v1.master_key_descriptor, 0x42,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
- break;
- case FSCRYPT_POLICY_V2:
+ } else if (!strcmp(arg, "v2")) {
+ policy->version = FSCRYPT_POLICY_V2;
policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(policy->v2.master_key_identifier, key_spec.u.identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- break;
- default:
- WARN_ON(1);
+ err = fscrypt_get_test_dummy_key_identifier(
+ policy->v2.master_key_identifier);
+ if (err)
+ goto out;
+ } else {
err = -EINVAL;
goto out;
}
@@ -789,6 +782,37 @@ out:
kfree(policy);
return err;
}
+EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption);
+
+/**
+ * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal
+ * @p1: the first test dummy policy (may be unset)
+ * @p2: the second test dummy policy (may be unset)
+ *
+ * Return: %true if the dummy policies are both set and equal, or both unset.
+ */
+bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
+ const struct fscrypt_dummy_policy *p2)
+{
+ if (!p1->policy && !p2->policy)
+ return true;
+ if (!p1->policy || !p2->policy)
+ return false;
+ return fscrypt_policies_equal(p1->policy, p2->policy);
+}
+EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal);
+
+/* Deprecated, do not use */
+int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
+ struct fscrypt_dummy_policy *dummy_policy)
+{
+ struct fs_parameter param = {
+ .type = fs_value_is_string,
+ .string = arg ? (char *)arg : "",
+ };
+ return fscrypt_parse_test_dummy_encryption(&param, dummy_policy) ?:
+ fscrypt_add_test_dummy_key(sb, dummy_policy);
+}
EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
/**
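The deprecated wrapper above uses the GNU `?:` extension: `a ?: b` evaluates `a` once and yields it if nonzero, else `b`. Its return statement therefore expands to the obvious two-step sequence:

    err = fscrypt_parse_test_dummy_encryption(&param, dummy_policy);
    if (err)
            return err;
    return fscrypt_add_test_dummy_key(sb, dummy_policy);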
diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..1ac12e877f4f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#define CREATE_TRACE_POINTS
@@ -789,95 +790,12 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
}
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
- unsigned long address;
-
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- return address;
-}
-
-/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
- unsigned long pfn)
-{
- struct vm_area_struct *vma;
- pte_t pte, *ptep = NULL;
- pmd_t *pmdp = NULL;
- spinlock_t *ptl;
-
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- struct mmu_notifier_range range;
- unsigned long address;
-
- cond_resched();
-
- if (!(vma->vm_flags & VM_SHARED))
- continue;
-
- address = pgoff_address(index, vma);
-
- /*
- * follow_invalidate_pte() will use the range to call
- * mmu_notifier_invalidate_range_start() on our behalf before
- * taking any lock.
- */
- if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
- continue;
-
- /*
- * No need to call mmu_notifier_invalidate_range() as we are
- * downgrading page table protection not changing it to point
- * to a new page.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
- pmd_t pmd;
-
- if (pfn != pmd_pfn(*pmdp))
- goto unlock_pmd;
- if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
- goto unlock_pmd;
-
- flush_cache_page(vma, address, pfn);
- pmd = pmdp_invalidate(vma, address, pmdp);
- pmd = pmd_wrprotect(pmd);
- pmd = pmd_mkclean(pmd);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
- spin_unlock(ptl);
- } else {
- if (pfn != pte_pfn(*ptep))
- goto unlock_pte;
- if (!pte_dirty(*ptep) && !pte_write(*ptep))
- goto unlock_pte;
-
- flush_cache_page(vma, address, pfn);
- pte = ptep_clear_flush(vma, address, ptep);
- pte = pte_wrprotect(pte);
- pte = pte_mkclean(pte);
- set_pte_at(vma->vm_mm, address, ptep, pte);
-unlock_pte:
- pte_unmap_unlock(ptep, ptl);
- }
-
- mmu_notifier_invalidate_range_end(&range);
- }
- i_mmap_unlock_read(mapping);
-}
-
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
struct address_space *mapping, void *entry)
{
- unsigned long pfn, index, count;
+ unsigned long pfn, index, count, end;
long ret = 0;
+ struct vm_area_struct *vma;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -935,8 +853,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
pfn = dax_to_pfn(entry);
count = 1UL << dax_entry_order(entry);
index = xas->xa_index & ~(count - 1);
+ end = index + count - 1;
+
+ /* Walk all mappings of a given index of a file and writeprotect them */
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+ pfn_mkclean_range(pfn, count, index, vma);
+ cond_resched();
+ }
+ i_mmap_unlock_read(mapping);
- dax_entry_mkclean(mapping, index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
/*
* After we have flushed the cache, we can clear the dirty tag. There
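The index/end arithmetic above aligns the VMA walk to the whole DAX entry. A worked example for a PMD-sized entry, where dax_entry_order() is 9 on x86-64 (values chosen for illustration):

    unsigned long count = 1UL << 9;              /* 512 pages per entry */
    unsigned long index = 1000UL & ~(count - 1); /* xas->xa_index 1000 -> 512 */
    unsigned long end   = index + count - 1;     /* -> 1023 */
    /* every shared VMA overlapping file pages [512, 1023] is write-protected */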
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aef06e607b40..840752006f60 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1115,11 +1115,10 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important
* for the whole file.
*/
-static inline ssize_t
-do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter,
- get_block_t get_block, dio_iodone_t end_io,
- dio_submit_t submit_io, int flags)
+ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, struct iov_iter *iter,
+ get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
{
unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
@@ -1334,29 +1333,6 @@ fail_dio:
kmem_cache_free(dio_cache, dio);
return retval;
}
-
-ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter,
- get_block_t get_block,
- dio_iodone_t end_io, dio_submit_t submit_io,
- int flags)
-{
- /*
- * The block device state is needed in the end to finally
- * submit everything. Since it's likely to be cache cold
- * prefetch it here as first thing to hide some of the
- * latency.
- *
- * Attempt to prefetch the pieces we likely need later.
- */
- prefetch(&bdev->bd_disk->part_tbl);
- prefetch(bdev->bd_disk->queue);
- prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES);
-
- return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
- end_io, submit_io, flags);
-}
-
EXPORT_SYMBOL(__blockdev_direct_IO);
static __init int dio_init(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index b6692f81ec83..fb1981654bb2 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -101,7 +101,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
*/
b = ls->ls_recover_buf->rc_buf;
- left = ls->ls_recover_buf->rc_header.h_length;
+ left = le16_to_cpu(ls->ls_recover_buf->rc_header.h_length);
left -= sizeof(struct dlm_rcom);
for (;;) {
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 74a9590a4dd5..776c3ed519f0 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -379,15 +379,15 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
#define DLM_FIN 5
struct dlm_header {
- uint32_t h_version;
+ __le32 h_version;
union {
/* for DLM_MSG and DLM_RCOM */
- uint32_t h_lockspace;
+ __le32 h_lockspace;
/* for DLM_ACK and DLM_OPTS */
- uint32_t h_seq;
+ __le32 h_seq;
} u;
- uint32_t h_nodeid; /* nodeid of sender */
- uint16_t h_length;
+ __le32 h_nodeid; /* nodeid of sender */
+ __le16 h_length;
uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
uint8_t h_pad;
};
@@ -409,24 +409,24 @@ struct dlm_header {
struct dlm_message {
struct dlm_header m_header;
- uint32_t m_type; /* DLM_MSG_ */
- uint32_t m_nodeid;
- uint32_t m_pid;
- uint32_t m_lkid; /* lkid on sender */
- uint32_t m_remid; /* lkid on receiver */
- uint32_t m_parent_lkid;
- uint32_t m_parent_remid;
- uint32_t m_exflags;
- uint32_t m_sbflags;
- uint32_t m_flags;
- uint32_t m_lvbseq;
- uint32_t m_hash;
- int m_status;
- int m_grmode;
- int m_rqmode;
- int m_bastmode;
- int m_asts;
- int m_result; /* 0 or -EXXX */
+ __le32 m_type; /* DLM_MSG_ */
+ __le32 m_nodeid;
+ __le32 m_pid;
+ __le32 m_lkid; /* lkid on sender */
+ __le32 m_remid; /* lkid on receiver */
+ __le32 m_parent_lkid;
+ __le32 m_parent_remid;
+ __le32 m_exflags;
+ __le32 m_sbflags;
+ __le32 m_flags;
+ __le32 m_lvbseq;
+ __le32 m_hash;
+ __le32 m_status;
+ __le32 m_grmode;
+ __le32 m_rqmode;
+ __le32 m_bastmode;
+ __le32 m_asts;
+ __le32 m_result; /* 0 or -EXXX */
char m_extra[]; /* name or lvb */
};
@@ -451,18 +451,18 @@ struct dlm_message {
struct dlm_rcom {
struct dlm_header rc_header;
- uint32_t rc_type; /* DLM_RCOM_ */
- int rc_result; /* multi-purpose */
- uint64_t rc_id; /* match reply with request */
- uint64_t rc_seq; /* sender's ls_recover_seq */
- uint64_t rc_seq_reply; /* remote ls_recover_seq */
+ __le32 rc_type; /* DLM_RCOM_ */
+ __le32 rc_result; /* multi-purpose */
+ __le64 rc_id; /* match reply with request */
+ __le64 rc_seq; /* sender's ls_recover_seq */
+ __le64 rc_seq_reply; /* remote ls_recover_seq */
char rc_buf[];
};
struct dlm_opt_header {
- uint16_t t_type;
- uint16_t t_length;
- uint32_t t_pad;
+ __le16 t_type;
+ __le16 t_length;
+ __le32 t_pad;
/* need to be 8 byte aligned */
char t_value[];
};
@@ -472,8 +472,8 @@ struct dlm_opts {
struct dlm_header o_header;
uint8_t o_nextcmd;
uint8_t o_pad;
- uint16_t o_optlen;
- uint32_t o_pad2;
+ __le16 o_optlen;
+ __le32 o_pad2;
char o_opts[];
};
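With the wire structures now declared in __le types, every field access must go through the byteswap helpers, as the dir.c hunk earlier already shows, and sparse will flag any direct assignment. Illustratively (rc and ms standing for a struct dlm_rcom and a struct dlm_message pointer):

    uint16_t len = le16_to_cpu(rc->rc_header.h_length);  /* read from wire */
    ms->m_type = cpu_to_le32(DLM_MSG_REQUEST);           /* write to wire */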
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index bdb51d209ba2..226822f49d30 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -350,10 +350,12 @@ static void put_rsb(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
uint32_t bucket = r->res_bucket;
+ int rv;
- spin_lock(&ls->ls_rsbtbl[bucket].lock);
- kref_put(&r->res_ref, toss_rsb);
- spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ rv = kref_put_lock(&r->res_ref, toss_rsb,
+ &ls->ls_rsbtbl[bucket].lock);
+ if (rv)
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
}
void dlm_put_rsb(struct dlm_rsb *r)
@@ -602,7 +604,6 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
*/
kref_get(&r->res_ref);
- error = 0;
goto out_unlock;
@@ -880,6 +881,88 @@ static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
}
}
+static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid,
+ int from_nodeid, bool toss_list, unsigned int flags,
+ int *r_nodeid, int *result)
+{
+ int fix_master = (flags & DLM_LU_RECOVER_MASTER);
+ int from_master = (flags & DLM_LU_RECOVER_DIR);
+
+ if (r->res_dir_nodeid != our_nodeid) {
+ /* should not happen, but may as well fix it and carry on */
+ log_error(ls, "%s res_dir %d our %d %s", __func__,
+ r->res_dir_nodeid, our_nodeid, r->res_name);
+ r->res_dir_nodeid = our_nodeid;
+ }
+
+ if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
+ /* Recovery uses this function to set a new master when
+ * the previous master failed. Setting NEW_MASTER will
+ * force dlm_recover_masters to call recover_master on this
+ * rsb even though the res_nodeid is no longer removed.
+ */
+
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ rsb_set_flag(r, RSB_NEW_MASTER);
+
+ if (toss_list) {
+ /* I don't think we should ever find it on toss list. */
+ log_error(ls, "%s fix_master on toss", __func__);
+ dlm_dump_rsb(r);
+ }
+ }
+
+ if (from_master && (r->res_master_nodeid != from_nodeid)) {
+ /* this will happen if from_nodeid became master during
+ * a previous recovery cycle, and we aborted the previous
+ * cycle before recovering this master value
+ */
+
+ log_limit(ls, "%s from_master %d master_nodeid %d res_nodeid %d first %x %s",
+ __func__, from_nodeid, r->res_master_nodeid,
+ r->res_nodeid, r->res_first_lkid, r->res_name);
+
+ if (r->res_master_nodeid == our_nodeid) {
+ log_error(ls, "from_master %d our_master", from_nodeid);
+ dlm_dump_rsb(r);
+ goto ret_assign;
+ }
+
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ rsb_set_flag(r, RSB_NEW_MASTER);
+ }
+
+ if (!r->res_master_nodeid) {
+ /* this will happen if recovery happens while we're looking
+ * up the master for this rsb
+ */
+
+ log_debug(ls, "%s master 0 to %d first %x %s", __func__,
+ from_nodeid, r->res_first_lkid, r->res_name);
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ }
+
+ if (!from_master && !fix_master &&
+ (r->res_master_nodeid == from_nodeid)) {
+ /* this can happen when the master sends remove, the dir node
+ * finds the rsb on the keep list and ignores the remove,
+ * and the former master sends a lookup
+ */
+
+ log_limit(ls, "%s from master %d flags %x first %x %s",
+ __func__, from_nodeid, flags, r->res_first_lkid,
+ r->res_name);
+ }
+
+ ret_assign:
+ *r_nodeid = r->res_master_nodeid;
+ if (result)
+ *result = DLM_LU_MATCH;
+}
+
/*
* We're the dir node for this res and another node wants to know the
* master nodeid. During normal operation (non recovery) this is only
@@ -914,10 +997,8 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
{
struct dlm_rsb *r = NULL;
uint32_t hash, b;
- int from_master = (flags & DLM_LU_RECOVER_DIR);
- int fix_master = (flags & DLM_LU_RECOVER_MASTER);
int our_nodeid = dlm_our_nodeid();
- int dir_nodeid, error, toss_list = 0;
+ int dir_nodeid, error;
if (len > DLM_RESNAME_MAXLEN)
return -EINVAL;
@@ -949,12 +1030,21 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
if (!error) {
/* because the rsb is active, we need to lock_rsb before
- checking/changing re_master_nodeid */
+ * checking/changing res_master_nodeid
+ */
hold_rsb(r);
spin_unlock(&ls->ls_rsbtbl[b].lock);
lock_rsb(r);
- goto found;
+
+ __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false,
+ flags, r_nodeid, result);
+
+ /* the rsb was active */
+ unlock_rsb(r);
+ put_rsb(r);
+
+ return 0;
}
error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
@@ -962,90 +1052,16 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
goto not_found;
/* because the rsb is inactive (on toss list), it's not refcounted
- and lock_rsb is not used, but is protected by the rsbtbl lock */
-
- toss_list = 1;
- found:
- if (r->res_dir_nodeid != our_nodeid) {
- /* should not happen, but may as well fix it and carry on */
- log_error(ls, "dlm_master_lookup res_dir %d our %d %s",
- r->res_dir_nodeid, our_nodeid, r->res_name);
- r->res_dir_nodeid = our_nodeid;
- }
-
- if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
- /* Recovery uses this function to set a new master when
- the previous master failed. Setting NEW_MASTER will
- force dlm_recover_masters to call recover_master on this
- rsb even though the res_nodeid is no longer removed. */
-
- r->res_master_nodeid = from_nodeid;
- r->res_nodeid = from_nodeid;
- rsb_set_flag(r, RSB_NEW_MASTER);
-
- if (toss_list) {
- /* I don't think we should ever find it on toss list. */
- log_error(ls, "dlm_master_lookup fix_master on toss");
- dlm_dump_rsb(r);
- }
- }
-
- if (from_master && (r->res_master_nodeid != from_nodeid)) {
- /* this will happen if from_nodeid became master during
- a previous recovery cycle, and we aborted the previous
- cycle before recovering this master value */
-
- log_limit(ls, "dlm_master_lookup from_master %d "
- "master_nodeid %d res_nodeid %d first %x %s",
- from_nodeid, r->res_master_nodeid, r->res_nodeid,
- r->res_first_lkid, r->res_name);
-
- if (r->res_master_nodeid == our_nodeid) {
- log_error(ls, "from_master %d our_master", from_nodeid);
- dlm_dump_rsb(r);
- goto out_found;
- }
-
- r->res_master_nodeid = from_nodeid;
- r->res_nodeid = from_nodeid;
- rsb_set_flag(r, RSB_NEW_MASTER);
- }
-
- if (!r->res_master_nodeid) {
- /* this will happen if recovery happens while we're looking
- up the master for this rsb */
-
- log_debug(ls, "dlm_master_lookup master 0 to %d first %x %s",
- from_nodeid, r->res_first_lkid, r->res_name);
- r->res_master_nodeid = from_nodeid;
- r->res_nodeid = from_nodeid;
- }
-
- if (!from_master && !fix_master &&
- (r->res_master_nodeid == from_nodeid)) {
- /* this can happen when the master sends remove, the dir node
- finds the rsb on the keep list and ignores the remove,
- and the former master sends a lookup */
+ * and lock_rsb is not used, but is protected by the rsbtbl lock
+ */
- log_limit(ls, "dlm_master_lookup from master %d flags %x "
- "first %x %s", from_nodeid, flags,
- r->res_first_lkid, r->res_name);
- }
+ __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags,
+ r_nodeid, result);
- out_found:
- *r_nodeid = r->res_master_nodeid;
- if (result)
- *result = DLM_LU_MATCH;
+ r->res_toss_time = jiffies;
+ /* the rsb was inactive (on toss list) */
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
- if (toss_list) {
- r->res_toss_time = jiffies;
- /* the rsb was inactive (on toss list) */
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- } else {
- /* the rsb was active */
- unlock_rsb(r);
- put_rsb(r);
- }
return 0;
not_found:
@@ -1076,7 +1092,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
if (result)
*result = DLM_LU_ADD;
*r_nodeid = from_nodeid;
- error = 0;
out_unlock:
spin_unlock(&ls->ls_rsbtbl[b].lock);
return error;
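
The hunks above fold the old found:/out_found: label paths into a single __dlm_master_lookup() helper; the two call sites differ only in how the rsb is protected, which is exactly what the toss_list flag records. Schematically (condensed from the patch, not a verbatim quote):

	/* keep-list hit: rsb is refcounted, so take lock_rsb */
	hold_rsb(r);
	spin_unlock(&ls->ls_rsbtbl[b].lock);
	lock_rsb(r);
	__dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false, flags,
			    r_nodeid, result);
	unlock_rsb(r);
	put_rsb(r);

	/* toss-list hit: only the rsbtbl bucket lock protects the rsb */
	__dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags,
			    r_nodeid, result);
	r->res_toss_time = jiffies;
	spin_unlock(&ls->ls_rsbtbl[b].lock);
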
@@ -1253,9 +1268,11 @@ static void kill_lkb(struct kref *kref)
static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
uint32_t lkid = lkb->lkb_id;
+ int rv;
- spin_lock(&ls->ls_lkbidr_spin);
- if (kref_put(&lkb->lkb_ref, kill_lkb)) {
+ rv = kref_put_lock(&lkb->lkb_ref, kill_lkb,
+ &ls->ls_lkbidr_spin);
+ if (rv) {
idr_remove(&ls->ls_lkbidr, lkid);
spin_unlock(&ls->ls_lkbidr_spin);
@@ -1265,11 +1282,9 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
if (lkb->lkb_lvbptr && is_master_copy(lkb))
dlm_free_lvb(lkb->lkb_lvbptr);
dlm_free_lkb(lkb);
- return 1;
- } else {
- spin_unlock(&ls->ls_lkbidr_spin);
- return 0;
}
+
+ return rv;
}
int dlm_put_lkb(struct dlm_lkb *lkb)
@@ -1306,13 +1321,17 @@ static inline void unhold_lkb(struct dlm_lkb *lkb)
static void lkb_add_ordered(struct list_head *new, struct list_head *head,
int mode)
{
- struct dlm_lkb *lkb = NULL;
+ struct dlm_lkb *lkb = NULL, *iter;
- list_for_each_entry(lkb, head, lkb_statequeue)
- if (lkb->lkb_rqmode < mode)
+ list_for_each_entry(iter, head, lkb_statequeue)
+ if (iter->lkb_rqmode < mode) {
+ lkb = iter;
+ list_add_tail(new, &iter->lkb_statequeue);
break;
+ }
- __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+ if (!lkb)
+ list_add_tail(new, head);
}
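
The lkb_add_ordered() rewrite is the standard fix for using a list_for_each_entry() cursor after the loop: when the traversal completes without a break, the cursor is a bogus pointer computed from the list head, so dereferencing it (as the old __list_add() call did via lkb->lkb_statequeue.prev) is invalid. The safe shape of the pattern (hypothetical item/key names):

	struct item *found = NULL, *iter;

	list_for_each_entry(iter, head, node) {
		if (iter->key < key) {
			found = iter;
			/* list_add_tail() on a member inserts before it */
			list_add_tail(new, &iter->node);
			break;
		}
	}
	if (!found)
		list_add_tail(new, head);	/* empty list, or all keys >= key */
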
/* add/remove lkb to rsb's grant/convert/wait queue */
@@ -1559,6 +1578,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
lkb->lkb_wait_type = 0;
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
goto out_del;
}
@@ -1571,8 +1591,8 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
}
log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait",
- lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid,
- mstype, lkb->lkb_flags);
+ lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0,
+ lkb->lkb_remid, mstype, lkb->lkb_flags);
return -1;
out_del:
@@ -1585,6 +1605,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
log_error(ls, "remwait error %x reply %d wait_type %d overlap",
lkb->lkb_id, mstype, lkb->lkb_wait_type);
lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
lkb->lkb_wait_type = 0;
}
@@ -1617,10 +1638,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error;
- if (ms->m_flags != DLM_IFL_STUB_MS)
+ if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
mutex_lock(&ls->ls_waiters_mutex);
- error = _remove_from_waiters(lkb, ms->m_type, ms);
- if (ms->m_flags != DLM_IFL_STUB_MS)
+ error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
+ if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
mutex_unlock(&ls->ls_waiters_mutex);
return error;
}
@@ -1795,7 +1816,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
- wake_up(&ls->ls_remove_wait);
send_remove(r);
@@ -1804,6 +1824,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
ls->ls_remove_len = 0;
memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
+ wake_up(&ls->ls_remove_wait);
dlm_free_rsb(r);
}
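
Moving the wake_up() after the ls_remove_len/ls_remove_name reset closes a small race: waking first lets a waiter run its condition check against the old state and go back to sleep before the state it is waiting on has changed. The general ordering is "update the condition under the lock, then wake" (the waiter-side condition here is assumed for illustration):

	spin_lock(&ls->ls_remove_spin);
	ls->ls_remove_len = 0;			/* state the waiter tests */
	memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
	spin_unlock(&ls->ls_remove_spin);
	wake_up(&ls->ls_remove_wait);		/* only after the update */

	/* waiter side, e.g.: */
	wait_event(ls->ls_remove_wait, ls->ls_remove_len == 0);
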
@@ -1866,7 +1887,7 @@ static void del_timeout(struct dlm_lkb *lkb)
void dlm_scan_timeout(struct dlm_ls *ls)
{
struct dlm_rsb *r;
- struct dlm_lkb *lkb;
+ struct dlm_lkb *lkb = NULL, *iter;
int do_cancel, do_warn;
s64 wait_us;
@@ -1877,27 +1898,28 @@ void dlm_scan_timeout(struct dlm_ls *ls)
do_cancel = 0;
do_warn = 0;
mutex_lock(&ls->ls_timeout_mutex);
- list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+ list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
wait_us = ktime_to_us(ktime_sub(ktime_get(),
- lkb->lkb_timestamp));
+ iter->lkb_timestamp));
- if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
- wait_us >= (lkb->lkb_timeout_cs * 10000))
+ if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
+ wait_us >= (iter->lkb_timeout_cs * 10000))
do_cancel = 1;
- if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
+ if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
wait_us >= dlm_config.ci_timewarn_cs * 10000)
do_warn = 1;
if (!do_cancel && !do_warn)
continue;
- hold_lkb(lkb);
+ hold_lkb(iter);
+ lkb = iter;
break;
}
mutex_unlock(&ls->ls_timeout_mutex);
- if (!do_cancel && !do_warn)
+ if (!lkb)
break;
r = lkb->lkb_resource;
@@ -2051,7 +2073,7 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
if (len > r->res_ls->ls_lvblen)
len = r->res_ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
- lkb->lkb_lvbseq = ms->m_lvbseq;
+ lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
}
}
@@ -2182,10 +2204,10 @@ static void munge_demoted(struct dlm_lkb *lkb)
static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
- ms->m_type != DLM_MSG_GRANT) {
+ if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) &&
+ ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) {
log_print("munge_altmode %x invalid reply type %d",
- lkb->lkb_id, ms->m_type);
+ lkb->lkb_id, le32_to_cpu(ms->m_type));
return;
}
@@ -2912,7 +2934,8 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (lkb->lkb_status != DLM_LKSTS_GRANTED)
goto out;
- if (lkb->lkb_wait_type)
+ /* lock not allowed if there's any op in progress */
+ if (lkb->lkb_wait_type || lkb->lkb_wait_count)
goto out;
if (is_overlap(lkb))
@@ -3563,13 +3586,13 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
ms = (struct dlm_message *) mb;
- ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- ms->m_header.u.h_lockspace = ls->ls_global_id;
- ms->m_header.h_nodeid = dlm_our_nodeid();
- ms->m_header.h_length = mb_len;
+ ms->m_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ ms->m_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
+ ms->m_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ ms->m_header.h_length = cpu_to_le16(mb_len);
ms->m_header.h_cmd = DLM_MSG;
- ms->m_type = mstype;
+ ms->m_type = cpu_to_le32(mstype);
*mh_ret = mh;
*ms_ret = ms;
@@ -3608,7 +3631,6 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
- dlm_message_out(ms);
dlm_midcomms_commit_mhandle(mh);
return 0;
}
@@ -3616,40 +3638,40 @@ static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
- ms->m_nodeid = lkb->lkb_nodeid;
- ms->m_pid = lkb->lkb_ownpid;
- ms->m_lkid = lkb->lkb_id;
- ms->m_remid = lkb->lkb_remid;
- ms->m_exflags = lkb->lkb_exflags;
- ms->m_sbflags = lkb->lkb_sbflags;
- ms->m_flags = lkb->lkb_flags;
- ms->m_lvbseq = lkb->lkb_lvbseq;
- ms->m_status = lkb->lkb_status;
- ms->m_grmode = lkb->lkb_grmode;
- ms->m_rqmode = lkb->lkb_rqmode;
- ms->m_hash = r->res_hash;
+ ms->m_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+ ms->m_pid = cpu_to_le32(lkb->lkb_ownpid);
+ ms->m_lkid = cpu_to_le32(lkb->lkb_id);
+ ms->m_remid = cpu_to_le32(lkb->lkb_remid);
+ ms->m_exflags = cpu_to_le32(lkb->lkb_exflags);
+ ms->m_sbflags = cpu_to_le32(lkb->lkb_sbflags);
+ ms->m_flags = cpu_to_le32(lkb->lkb_flags);
+ ms->m_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
+ ms->m_status = cpu_to_le32(lkb->lkb_status);
+ ms->m_grmode = cpu_to_le32(lkb->lkb_grmode);
+ ms->m_rqmode = cpu_to_le32(lkb->lkb_rqmode);
+ ms->m_hash = cpu_to_le32(r->res_hash);
/* m_result and m_bastmode are set from function args,
not from lkb fields */
if (lkb->lkb_bastfn)
- ms->m_asts |= DLM_CB_BAST;
+ ms->m_asts |= cpu_to_le32(DLM_CB_BAST);
if (lkb->lkb_astfn)
- ms->m_asts |= DLM_CB_CAST;
+ ms->m_asts |= cpu_to_le32(DLM_CB_CAST);
/* compare with switch in create_message; send_remove() doesn't
use send_args() */
switch (ms->m_type) {
- case DLM_MSG_REQUEST:
- case DLM_MSG_LOOKUP:
+ case cpu_to_le32(DLM_MSG_REQUEST):
+ case cpu_to_le32(DLM_MSG_LOOKUP):
memcpy(ms->m_extra, r->res_name, r->res_length);
break;
- case DLM_MSG_CONVERT:
- case DLM_MSG_UNLOCK:
- case DLM_MSG_REQUEST_REPLY:
- case DLM_MSG_CONVERT_REPLY:
- case DLM_MSG_GRANT:
+ case cpu_to_le32(DLM_MSG_CONVERT):
+ case cpu_to_le32(DLM_MSG_UNLOCK):
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
+ case cpu_to_le32(DLM_MSG_GRANT):
if (!lkb->lkb_lvbptr)
break;
memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
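
Note the switch style above: instead of converting ms->m_type to CPU order first, each case label wraps its constant in cpu_to_le32(). On a constant that conversion folds at compile time, so the comparison is against a plain integer constant, the runtime cost is unchanged, and the __le32 value is never mixed with native-order values. In isolation (handler names are hypothetical):

	switch (ms->m_type) {
	case cpu_to_le32(DLM_MSG_REQUEST):	/* folded to a constant */
		handle_request(ls, ms);
		break;
	case cpu_to_le32(DLM_MSG_GRANT):
		handle_grant(ls, ms);
		break;
	default:
		log_error(ls, "unknown type %d", le32_to_cpu(ms->m_type));
	}
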
@@ -3699,8 +3721,8 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
/* down conversions go without a reply from the master */
if (!error && down_conversion(lkb)) {
remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
- r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
- r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
+ r->res_ls->ls_stub_ms.m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ r->res_ls->ls_stub_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
r->res_ls->ls_stub_ms.m_result = 0;
__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
}
@@ -3757,7 +3779,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
send_args(r, lkb, ms);
- ms->m_bastmode = mode;
+ ms->m_bastmode = cpu_to_le32(mode);
error = send_message(mh, ms);
out:
@@ -3805,7 +3827,7 @@ static int send_remove(struct dlm_rsb *r)
goto out;
memcpy(ms->m_extra, r->res_name, r->res_length);
- ms->m_hash = r->res_hash;
+ ms->m_hash = cpu_to_le32(r->res_hash);
error = send_message(mh, ms);
out:
@@ -3827,7 +3849,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
send_args(r, lkb, ms);
- ms->m_result = rv;
+ ms->m_result = cpu_to_le32(to_dlm_errno(rv));
error = send_message(mh, ms);
out:
@@ -3860,15 +3882,15 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
struct dlm_rsb *r = &ls->ls_stub_rsb;
struct dlm_message *ms;
struct dlm_mhandle *mh;
- int error, nodeid = ms_in->m_header.h_nodeid;
+ int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
if (error)
goto out;
ms->m_lkid = ms_in->m_lkid;
- ms->m_result = rv;
- ms->m_nodeid = ret_nodeid;
+ ms->m_result = cpu_to_le32(to_dlm_errno(rv));
+ ms->m_nodeid = cpu_to_le32(ret_nodeid);
error = send_message(mh, ms);
out:
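
m_result is additionally routed through to_dlm_errno()/from_dlm_errno(). Raw errno values are architecture-specific ABI, so they cannot be shipped between nodes as-is; the helpers map the few errnos DLM sends onto fixed wire codes. A sketch of that translation style (the DLM_ERRNO_* values shown are illustrative, assuming the common Linux numbering):

	#define DLM_ERRNO_EDEADLK 35
	#define DLM_ERRNO_EBADR   53

	static int to_dlm_errno(int err)
	{
		switch (err) {
		case -EDEADLK: return -DLM_ERRNO_EDEADLK;
		case -EBADR:   return -DLM_ERRNO_EBADR;
		default:       return err;	/* already portable */
		}
	}

	static int from_dlm_errno(int err)
	{
		switch (err) {
		case -DLM_ERRNO_EDEADLK: return -EDEADLK;
		case -DLM_ERRNO_EBADR:   return -EBADR;
		default:                 return err;
		}
	}
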
@@ -3881,25 +3903,26 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- lkb->lkb_exflags = ms->m_exflags;
- lkb->lkb_sbflags = ms->m_sbflags;
+ lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
+ lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
- (ms->m_flags & 0x0000FFFF);
+ (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
}
static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- if (ms->m_flags == DLM_IFL_STUB_MS)
+ if (ms->m_flags == cpu_to_le32(DLM_IFL_STUB_MS))
return;
- lkb->lkb_sbflags = ms->m_sbflags;
+ lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
- (ms->m_flags & 0x0000FFFF);
+ (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
}
static int receive_extralen(struct dlm_message *ms)
{
- return (ms->m_header.h_length - sizeof(struct dlm_message));
+ return (le16_to_cpu(ms->m_header.h_length) -
+ sizeof(struct dlm_message));
}
static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
@@ -3933,14 +3956,14 @@ static void fake_astfn(void *astparam)
static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
- lkb->lkb_nodeid = ms->m_header.h_nodeid;
- lkb->lkb_ownpid = ms->m_pid;
- lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+ lkb->lkb_ownpid = le32_to_cpu(ms->m_pid);
+ lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
lkb->lkb_grmode = DLM_LOCK_IV;
- lkb->lkb_rqmode = ms->m_rqmode;
+ lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
- lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
- lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
+ lkb->lkb_bastfn = (ms->m_asts & cpu_to_le32(DLM_CB_BAST)) ? &fake_bastfn : NULL;
+ lkb->lkb_astfn = (ms->m_asts & cpu_to_le32(DLM_CB_CAST)) ? &fake_astfn : NULL;
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
/* lkb was just created so there won't be an lvb yet */
@@ -3961,8 +3984,8 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (receive_lvb(ls, lkb, ms))
return -ENOMEM;
- lkb->lkb_rqmode = ms->m_rqmode;
- lkb->lkb_lvbseq = ms->m_lvbseq;
+ lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
+ lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
return 0;
}
@@ -3981,8 +4004,8 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb = &ls->ls_stub_lkb;
- lkb->lkb_nodeid = ms->m_header.h_nodeid;
- lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+ lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
}
/* This is called after the rsb is locked so that we can safely inspect
@@ -3990,11 +4013,12 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- int from = ms->m_header.h_nodeid;
+ int from = le32_to_cpu(ms->m_header.h_nodeid);
int error = 0;
/* currently mixing of user/kernel locks are not supported */
- if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) {
+ if (ms->m_flags & cpu_to_le32(DLM_IFL_USER) &&
+ ~lkb->lkb_flags & DLM_IFL_USER) {
log_error(lkb->lkb_resource->res_ls,
"got user dlm message for a kernel lock");
error = -EINVAL;
@@ -4002,23 +4026,23 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
}
switch (ms->m_type) {
- case DLM_MSG_CONVERT:
- case DLM_MSG_UNLOCK:
- case DLM_MSG_CANCEL:
+ case cpu_to_le32(DLM_MSG_CONVERT):
+ case cpu_to_le32(DLM_MSG_UNLOCK):
+ case cpu_to_le32(DLM_MSG_CANCEL):
if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
error = -EINVAL;
break;
- case DLM_MSG_CONVERT_REPLY:
- case DLM_MSG_UNLOCK_REPLY:
- case DLM_MSG_CANCEL_REPLY:
- case DLM_MSG_GRANT:
- case DLM_MSG_BAST:
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
+ case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
+ case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
+ case cpu_to_le32(DLM_MSG_GRANT):
+ case cpu_to_le32(DLM_MSG_BAST):
if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
error = -EINVAL;
break;
- case DLM_MSG_REQUEST_REPLY:
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
if (!is_process_copy(lkb))
error = -EINVAL;
else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
@@ -4033,8 +4057,8 @@ out:
if (error)
log_error(lkb->lkb_resource->res_ls,
"ignore invalid message %d from %d %x %x %x %d",
- ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
- lkb->lkb_flags, lkb->lkb_nodeid);
+ le32_to_cpu(ms->m_type), from, lkb->lkb_id,
+ lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_nodeid);
return error;
}
@@ -4079,22 +4103,23 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
- wake_up(&ls->ls_remove_wait);
rv = _create_message(ls, sizeof(struct dlm_message) + len,
dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
if (rv)
- return;
+ goto out;
memcpy(ms->m_extra, name, len);
- ms->m_hash = hash;
+ ms->m_hash = cpu_to_le32(hash);
send_message(mh, ms);
+out:
spin_lock(&ls->ls_remove_spin);
ls->ls_remove_len = 0;
memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
+ wake_up(&ls->ls_remove_wait);
}
static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
@@ -4104,7 +4129,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
int from_nodeid;
int error, namelen = 0;
- from_nodeid = ms->m_header.h_nodeid;
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
error = create_lkb(ls, &lkb);
if (error)
@@ -4177,7 +4202,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
if (error != -ENOTBLK) {
log_limit(ls, "receive_request %x from %d %d",
- ms->m_lkid, from_nodeid, error);
+ le32_to_cpu(ms->m_lkid), from_nodeid, error);
}
if (namelen && error == -EBADR) {
@@ -4196,15 +4221,16 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error, reply = 1;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
goto fail;
- if (lkb->lkb_remid != ms->m_lkid) {
+ if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
log_error(ls, "receive_convert %x remid %x recover_seq %llu "
"remote %d %x", lkb->lkb_id, lkb->lkb_remid,
(unsigned long long)lkb->lkb_recover_seq,
- ms->m_header.h_nodeid, ms->m_lkid);
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid));
error = -ENOENT;
dlm_put_lkb(lkb);
goto fail;
@@ -4251,14 +4277,15 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
goto fail;
- if (lkb->lkb_remid != ms->m_lkid) {
+ if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
log_error(ls, "receive_unlock %x remid %x remote %d %x",
lkb->lkb_id, lkb->lkb_remid,
- ms->m_header.h_nodeid, ms->m_lkid);
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid));
error = -ENOENT;
dlm_put_lkb(lkb);
goto fail;
@@ -4302,7 +4329,7 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
goto fail;
@@ -4338,7 +4365,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4369,7 +4396,7 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4382,8 +4409,8 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
if (error)
goto out;
- queue_bast(r, lkb, ms->m_bastmode);
- lkb->lkb_highbast = ms->m_bastmode;
+ queue_bast(r, lkb, le32_to_cpu(ms->m_bastmode));
+ lkb->lkb_highbast = le32_to_cpu(ms->m_bastmode);
out:
unlock_rsb(r);
put_rsb(r);
@@ -4395,7 +4422,7 @@ static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
{
int len, error, ret_nodeid, from_nodeid, our_nodeid;
- from_nodeid = ms->m_header.h_nodeid;
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
our_nodeid = dlm_our_nodeid();
len = receive_extralen(ms);
@@ -4418,7 +4445,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
uint32_t hash, b;
int rv, len, dir_nodeid, from_nodeid;
- from_nodeid = ms->m_header.h_nodeid;
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
len = receive_extralen(ms);
@@ -4428,7 +4455,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
return;
}
- dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+ dir_nodeid = dlm_hash2nodeid(ls, le32_to_cpu(ms->m_hash));
if (dir_nodeid != dlm_our_nodeid()) {
log_error(ls, "receive_remove from %d bad nodeid %d",
from_nodeid, dir_nodeid);
@@ -4501,7 +4528,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
{
- do_purge(ls, ms->m_nodeid, ms->m_pid);
+ do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid));
}
static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
@@ -4509,9 +4536,9 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error, mstype, result;
- int from_nodeid = ms->m_header.h_nodeid;
+ int from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4527,7 +4554,8 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
if (error) {
log_error(ls, "receive_request_reply %x remote %d %x result %d",
- lkb->lkb_id, from_nodeid, ms->m_lkid, ms->m_result);
+ lkb->lkb_id, from_nodeid, le32_to_cpu(ms->m_lkid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
dlm_dump_rsb(r);
goto out;
}
@@ -4541,7 +4569,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
}
/* this is the value returned from do_request() on the master */
- result = ms->m_result;
+ result = from_dlm_errno(le32_to_cpu(ms->m_result));
switch (result) {
case -EAGAIN:
@@ -4555,7 +4583,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
case 0:
/* request was queued or granted on remote master */
receive_flags_reply(lkb, ms);
- lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
if (is_altmode(lkb))
munge_altmode(lkb, ms);
if (result) {
@@ -4628,7 +4656,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
/* this is the value returned from do_convert() on the master */
- switch (ms->m_result) {
+ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
case -EAGAIN:
/* convert would block (be queued) on remote master */
queue_cast(r, lkb, -EAGAIN);
@@ -4661,8 +4689,9 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
default:
log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d",
- lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid,
- ms->m_result);
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
dlm_print_rsb(r);
dlm_print_lkb(lkb);
}
@@ -4696,7 +4725,7 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4724,7 +4753,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
/* this is the value returned from do_unlock() on the master */
- switch (ms->m_result) {
+ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
case -DLM_EUNLOCK:
receive_flags_reply(lkb, ms);
remove_lock_pc(r, lkb);
@@ -4734,7 +4763,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
break;
default:
log_error(r->res_ls, "receive_unlock_reply %x error %d",
- lkb->lkb_id, ms->m_result);
+ lkb->lkb_id, from_dlm_errno(le32_to_cpu(ms->m_result)));
}
out:
unlock_rsb(r);
@@ -4746,7 +4775,7 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4774,7 +4803,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
/* this is the value returned from do_cancel() on the master */
- switch (ms->m_result) {
+ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
case -DLM_ECANCEL:
receive_flags_reply(lkb, ms);
revert_lock_pc(r, lkb);
@@ -4784,7 +4813,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
break;
default:
log_error(r->res_ls, "receive_cancel_reply %x error %d",
- lkb->lkb_id, ms->m_result);
+ lkb->lkb_id,
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
}
out:
unlock_rsb(r);
@@ -4796,7 +4826,7 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4812,9 +4842,10 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
int error, ret_nodeid;
int do_lookup_list = 0;
- error = find_lkb(ls, ms->m_lkid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_lkid), &lkb);
if (error) {
- log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid);
+ log_error(ls, "%s no lkid %x", __func__,
+ le32_to_cpu(ms->m_lkid));
return;
}
@@ -4829,7 +4860,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
if (error)
goto out;
- ret_nodeid = ms->m_nodeid;
+ ret_nodeid = le32_to_cpu(ms->m_nodeid);
/* We sometimes receive a request from the dir node for this
rsb before we've received the dir node's lookup_reply for it.
@@ -4841,8 +4872,8 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
/* This should never happen */
log_error(ls, "receive_lookup_reply %x from %d ret %d "
"master %d dir %d our %d first %x %s",
- lkb->lkb_id, ms->m_header.h_nodeid, ret_nodeid,
- r->res_master_nodeid, r->res_dir_nodeid,
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
+ ret_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
dlm_our_nodeid(), r->res_first_lkid, r->res_name);
}
@@ -4854,7 +4885,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
} else if (ret_nodeid == -1) {
/* the remote node doesn't believe it's the dir node */
log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid",
- lkb->lkb_id, ms->m_header.h_nodeid);
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid));
r->res_master_nodeid = 0;
r->res_nodeid = -1;
lkb->lkb_nodeid = -1;
@@ -4888,10 +4919,12 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
{
int error = 0, noent = 0;
- if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
+ if (!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid))) {
log_limit(ls, "receive %d from non-member %d %x %x %d",
- ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
- ms->m_remid, ms->m_result);
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
return;
}
@@ -4899,77 +4932,78 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
/* messages sent to a master node */
- case DLM_MSG_REQUEST:
+ case cpu_to_le32(DLM_MSG_REQUEST):
error = receive_request(ls, ms);
break;
- case DLM_MSG_CONVERT:
+ case cpu_to_le32(DLM_MSG_CONVERT):
error = receive_convert(ls, ms);
break;
- case DLM_MSG_UNLOCK:
+ case cpu_to_le32(DLM_MSG_UNLOCK):
error = receive_unlock(ls, ms);
break;
- case DLM_MSG_CANCEL:
+ case cpu_to_le32(DLM_MSG_CANCEL):
noent = 1;
error = receive_cancel(ls, ms);
break;
/* messages sent from a master node (replies to above) */
- case DLM_MSG_REQUEST_REPLY:
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
error = receive_request_reply(ls, ms);
break;
- case DLM_MSG_CONVERT_REPLY:
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
error = receive_convert_reply(ls, ms);
break;
- case DLM_MSG_UNLOCK_REPLY:
+ case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
error = receive_unlock_reply(ls, ms);
break;
- case DLM_MSG_CANCEL_REPLY:
+ case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
error = receive_cancel_reply(ls, ms);
break;
/* messages sent from a master node (only two types of async msg) */
- case DLM_MSG_GRANT:
+ case cpu_to_le32(DLM_MSG_GRANT):
noent = 1;
error = receive_grant(ls, ms);
break;
- case DLM_MSG_BAST:
+ case cpu_to_le32(DLM_MSG_BAST):
noent = 1;
error = receive_bast(ls, ms);
break;
/* messages sent to a dir node */
- case DLM_MSG_LOOKUP:
+ case cpu_to_le32(DLM_MSG_LOOKUP):
receive_lookup(ls, ms);
break;
- case DLM_MSG_REMOVE:
+ case cpu_to_le32(DLM_MSG_REMOVE):
receive_remove(ls, ms);
break;
/* messages sent from a dir node (remove has no reply) */
- case DLM_MSG_LOOKUP_REPLY:
+ case cpu_to_le32(DLM_MSG_LOOKUP_REPLY):
receive_lookup_reply(ls, ms);
break;
/* other messages */
- case DLM_MSG_PURGE:
+ case cpu_to_le32(DLM_MSG_PURGE):
receive_purge(ls, ms);
break;
default:
- log_error(ls, "unknown message type %d", ms->m_type);
+ log_error(ls, "unknown message type %d",
+ le32_to_cpu(ms->m_type));
}
/*
@@ -4985,22 +5019,26 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
if (error == -ENOENT && noent) {
log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
- ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
- ms->m_lkid, saved_seq);
+ le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), saved_seq);
} else if (error == -ENOENT) {
log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
- ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
- ms->m_lkid, saved_seq);
+ le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), saved_seq);
- if (ms->m_type == DLM_MSG_CONVERT)
- dlm_dump_rsb_hash(ls, ms->m_hash);
+ if (ms->m_type == cpu_to_le32(DLM_MSG_CONVERT))
+ dlm_dump_rsb_hash(ls, le32_to_cpu(ms->m_hash));
}
if (error == -EINVAL) {
log_error(ls, "receive %d inval from %d lkid %x remid %x "
"saved_seq %u",
- ms->m_type, ms->m_header.h_nodeid,
- ms->m_lkid, ms->m_remid, saved_seq);
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ saved_seq);
}
}
@@ -5021,7 +5059,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
lockspace generation before we left. */
if (!ls->ls_generation) {
log_limit(ls, "receive %d from %d ignore old gen",
- ms->m_type, nodeid);
+ le32_to_cpu(ms->m_type), nodeid);
return;
}
@@ -5054,30 +5092,30 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
switch (hd->h_cmd) {
case DLM_MSG:
- dlm_message_in(&p->message);
- type = p->message.m_type;
+ type = le32_to_cpu(p->message.m_type);
break;
case DLM_RCOM:
- dlm_rcom_in(&p->rcom);
- type = p->rcom.rc_type;
+ type = le32_to_cpu(p->rcom.rc_type);
break;
default:
log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
return;
}
- if (hd->h_nodeid != nodeid) {
+ if (le32_to_cpu(hd->h_nodeid) != nodeid) {
log_print("invalid h_nodeid %d from %d lockspace %x",
- hd->h_nodeid, nodeid, hd->u.h_lockspace);
+ le32_to_cpu(hd->h_nodeid), nodeid,
+ le32_to_cpu(hd->u.h_lockspace));
return;
}
- ls = dlm_find_lockspace_global(hd->u.h_lockspace);
+ ls = dlm_find_lockspace_global(le32_to_cpu(hd->u.h_lockspace));
if (!ls) {
if (dlm_config.ci_log_debug) {
printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
"%u from %d cmd %d type %d\n",
- hd->u.h_lockspace, nodeid, hd->h_cmd, type);
+ le32_to_cpu(hd->u.h_lockspace), nodeid,
+ hd->h_cmd, type);
}
if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
@@ -5104,10 +5142,10 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (middle_conversion(lkb)) {
hold_lkb(lkb);
memset(ms_stub, 0, sizeof(struct dlm_message));
- ms_stub->m_flags = DLM_IFL_STUB_MS;
- ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
- ms_stub->m_result = -EINPROGRESS;
- ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ ms_stub->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
+ ms_stub->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
+ ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
_receive_convert_reply(lkb, ms_stub);
/* Same special case as in receive_rcom_lock_args() */
@@ -5226,10 +5264,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
case DLM_MSG_UNLOCK:
hold_lkb(lkb);
memset(ms_stub, 0, sizeof(struct dlm_message));
- ms_stub->m_flags = DLM_IFL_STUB_MS;
- ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
- ms_stub->m_result = stub_unlock_result;
- ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ ms_stub->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
+ ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_unlock_result));
+ ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
_receive_unlock_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
@@ -5237,10 +5275,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
case DLM_MSG_CANCEL:
hold_lkb(lkb);
memset(ms_stub, 0, sizeof(struct dlm_message));
- ms_stub->m_flags = DLM_IFL_STUB_MS;
- ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
- ms_stub->m_result = stub_cancel_result;
- ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ ms_stub->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
+ ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_cancel_result));
+ ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
_receive_cancel_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
@@ -5257,21 +5295,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
{
- struct dlm_lkb *lkb;
- int found = 0;
+ struct dlm_lkb *lkb = NULL, *iter;
mutex_lock(&ls->ls_waiters_mutex);
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (lkb->lkb_flags & DLM_IFL_RESEND) {
- hold_lkb(lkb);
- found = 1;
+ list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) {
+ if (iter->lkb_flags & DLM_IFL_RESEND) {
+ hold_lkb(iter);
+ lkb = iter;
break;
}
}
mutex_unlock(&ls->ls_waiters_mutex);
- if (!found)
- lkb = NULL;
return lkb;
}
@@ -5331,11 +5366,16 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
lkb->lkb_wait_type = 0;
- lkb->lkb_wait_count = 0;
+ /* drop all wait_count references; we still
+ * hold a reference for this iteration.
+ */
+ while (lkb->lkb_wait_count) {
+ lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
+ }
mutex_lock(&ls->ls_waiters_mutex);
list_del_init(&lkb->lkb_wait_reply);
mutex_unlock(&ls->ls_waiters_mutex);
- unhold_lkb(lkb); /* for waiters list */
if (oc || ou) {
/* do an unlock or cancel instead of resending */
@@ -5605,7 +5645,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
- lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+ lkb->lkb_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
@@ -5620,8 +5660,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
- int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
- sizeof(struct rcom_lock);
+ int lvblen = le16_to_cpu(rc->rc_header.h_length) -
+ sizeof(struct dlm_rcom) - sizeof(struct rcom_lock);
if (lvblen > ls->ls_lvblen)
return -EINVAL;
lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
@@ -5657,7 +5697,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
struct dlm_rsb *r;
struct dlm_lkb *lkb;
uint32_t remid = 0;
- int from_nodeid = rc->rc_header.h_nodeid;
+ int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
int error;
if (rl->rl_parent_lkid) {
@@ -5707,7 +5747,6 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
attach_lkb(r, lkb);
add_lkb(r, lkb, rl->rl_status);
- error = 0;
ls->ls_recover_locks_in++;
if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
@@ -5747,7 +5786,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
error = find_lkb(ls, lkid, &lkb);
if (error) {
log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
return error;
}
@@ -5757,7 +5797,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
if (!is_process_copy(lkb)) {
log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
dlm_dump_rsb(r);
unlock_rsb(r);
put_rsb(r);
@@ -5772,7 +5813,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
a barrier between recover_masters and recover_locks. */
log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
dlm_send_rcom_lock(r, lkb);
goto out;
@@ -5782,7 +5824,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
break;
default:
log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
}
/* an ack for dlm_recover_locks() which waits for replies from
@@ -5925,37 +5968,36 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs, uint32_t *lkid)
{
- struct dlm_lkb *lkb;
+ struct dlm_lkb *lkb = NULL, *iter;
struct dlm_user_args *ua;
int found_other_mode = 0;
- int found = 0;
int rv = 0;
mutex_lock(&ls->ls_orphans_mutex);
- list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) {
- if (lkb->lkb_resource->res_length != namelen)
+ list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) {
+ if (iter->lkb_resource->res_length != namelen)
continue;
- if (memcmp(lkb->lkb_resource->res_name, name, namelen))
+ if (memcmp(iter->lkb_resource->res_name, name, namelen))
continue;
- if (lkb->lkb_grmode != mode) {
+ if (iter->lkb_grmode != mode) {
found_other_mode = 1;
continue;
}
- found = 1;
- list_del_init(&lkb->lkb_ownqueue);
- lkb->lkb_flags &= ~DLM_IFL_ORPHAN;
- *lkid = lkb->lkb_id;
+ lkb = iter;
+ list_del_init(&iter->lkb_ownqueue);
+ iter->lkb_flags &= ~DLM_IFL_ORPHAN;
+ *lkid = iter->lkb_id;
break;
}
mutex_unlock(&ls->ls_orphans_mutex);
- if (!found && found_other_mode) {
+ if (!lkb && found_other_mode) {
rv = -EAGAIN;
goto out;
}
- if (!found) {
+ if (!lkb) {
rv = -ENOENT;
goto out;
}
@@ -6307,8 +6349,8 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
DLM_MSG_PURGE, &ms, &mh);
if (error)
return error;
- ms->m_nodeid = nodeid;
- ms->m_pid = pid;
+ ms->m_nodeid = cpu_to_le32(nodeid);
+ ms->m_pid = cpu_to_le32(pid);
return send_message(mh, ms);
}
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 0d3833a124a3..19ed41a5da93 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -922,3 +922,15 @@ void dlm_stop_lockspaces(void)
log_print("dlm user daemon left %d lockspaces", count);
}
+void dlm_stop_lockspaces_check(void)
+{
+ struct dlm_ls *ls;
+
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (WARN_ON(!rwsem_is_locked(&ls->ls_in_recovery) ||
+ !dlm_locking_stopped(ls)))
+ break;
+ }
+ spin_unlock(&lslist_lock);
+}
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index a78d853b9342..306fc4f4ea15 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -19,6 +19,7 @@ struct dlm_ls *dlm_find_lockspace_local(void *id);
struct dlm_ls *dlm_find_lockspace_device(int minor);
void dlm_put_lockspace(struct dlm_ls *ls);
void dlm_stop_lockspaces(void);
+void dlm_stop_lockspaces_check(void);
#endif /* __LOCKSPACE_DOT_H__ */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e284d696c1fd..19e82f08c0e0 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1303,6 +1303,10 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
return msg;
}
+/* avoid a false positive for connections_srcu; the unlock happens in
+ * dlm_lowcomms_commit_msg(), which must be called on success
+ */
+#ifndef __CHECKER__
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
char **ppc, void (*cb)(void *data),
void *data)
@@ -1336,6 +1340,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
msg->idx = idx;
return msg;
}
+#endif
static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
@@ -1362,11 +1367,16 @@ out:
return;
}
+/* avoid a false positive for connections_srcu; the lock was taken in
+ * dlm_lowcomms_new_msg()
+ */
+#ifndef __CHECKER__
void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
_dlm_lowcomms_commit_msg(msg);
srcu_read_unlock(&connections_srcu, msg->idx);
}
+#endif
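
The #ifndef __CHECKER__ guards simply hide this lock/unlock pair from sparse, which cannot follow an srcu_read_lock() taken in one function and released in another. A possible annotation-based alternative, sketched with sparse context attributes (declarations only; whether sparse models SRCU well enough for this is exactly the doubt the guards sidestep):

	struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
					     char **ppc, void (*cb)(void *data),
					     void *data)
		__acquires(&connections_srcu);

	void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
		__releases(&connections_srcu);
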
void dlm_lowcomms_put_msg(struct dlm_msg *msg)
{
@@ -1789,7 +1799,7 @@ static int dlm_listen_for_all(void)
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
log_print("Can't create comms socket: %d", result);
- goto out;
+ return result;
}
sock_set_mark(sock->sk, dlm_config.ci_mark);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 61f906e705db..98084e0cfccf 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -20,7 +20,7 @@
int dlm_slots_version(struct dlm_header *h)
{
- if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS)
+ if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS)
return 0;
return 1;
}
@@ -120,18 +120,13 @@ int dlm_slots_copy_in(struct dlm_ls *ls)
ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
- for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
- ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
- ro->ro_slot = le16_to_cpu(ro->ro_slot);
- }
-
log_slots(ls, gen, num_slots, ro0, NULL, 0);
list_for_each_entry(memb, &ls->ls_nodes, list) {
for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
- if (ro->ro_nodeid != memb->nodeid)
+ if (le32_to_cpu(ro->ro_nodeid) != memb->nodeid)
continue;
- memb->slot = ro->ro_slot;
+ memb->slot = le16_to_cpu(ro->ro_slot);
memb->slot_prev = memb->slot;
break;
}
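
The deleted loop converted ro_nodeid/ro_slot to CPU order in place inside the receive buffer, which defeats the __le typing and would corrupt the data if the buffer were ever walked a second time; the replacement converts at each point of use and leaves the buffer in wire order. The rule in miniature (hypothetical variables):

	const struct rcom_slot *ro;
	int i;

	for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
		if (le32_to_cpu(ro->ro_nodeid) != want_nodeid)
			continue;		/* buffer stays wire-order */
		slot = le16_to_cpu(ro->ro_slot);	/* native-order copy */
		break;
	}
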
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 3635e42b0669..6489bc22ad61 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -135,6 +135,7 @@
#include <net/tcp.h>
#include "dlm_internal.h"
+#include "lockspace.h"
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
@@ -380,13 +381,12 @@ static int dlm_send_ack(int nodeid, uint32_t seq)
m_header = (struct dlm_header *)ppc;
- m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- m_header->h_nodeid = dlm_our_nodeid();
- m_header->h_length = mb_len;
+ m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ m_header->h_length = cpu_to_le16(mb_len);
m_header->h_cmd = DLM_ACK;
- m_header->u.h_seq = seq;
+ m_header->u.h_seq = cpu_to_le32(seq);
- header_out(m_header);
dlm_lowcomms_commit_msg(msg);
dlm_lowcomms_put_msg(msg);
@@ -409,13 +409,11 @@ static int dlm_send_fin(struct midcomms_node *node,
m_header = (struct dlm_header *)ppc;
- m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- m_header->h_nodeid = dlm_our_nodeid();
- m_header->h_length = mb_len;
+ m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ m_header->h_length = cpu_to_le16(mb_len);
m_header->h_cmd = DLM_FIN;
- header_out(m_header);
-
pr_debug("sending fin msg to node %d\n", node->nodeid);
dlm_midcomms_commit_mhandle(mh);
set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
@@ -574,14 +572,14 @@ dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p,
return NULL;
}
- switch (le32_to_cpu(p->rcom.rc_type)) {
- case DLM_RCOM_NAMES:
+ switch (p->rcom.rc_type) {
+ case cpu_to_le32(DLM_RCOM_NAMES):
fallthrough;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
fallthrough;
- case DLM_RCOM_STATUS:
+ case cpu_to_le32(DLM_RCOM_STATUS):
fallthrough;
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
node = nodeid2node(nodeid, 0);
if (node) {
spin_lock(&node->state_lock);
@@ -741,14 +739,14 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
*
* length already checked.
*/
- switch (le32_to_cpu(p->rcom.rc_type)) {
- case DLM_RCOM_NAMES:
+ switch (p->rcom.rc_type) {
+ case cpu_to_le32(DLM_RCOM_NAMES):
fallthrough;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
fallthrough;
- case DLM_RCOM_STATUS:
+ case cpu_to_le32(DLM_RCOM_STATUS):
fallthrough;
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
break;
default:
log_print("unsupported rcom type received: %u, will skip this message from node %d",
@@ -1020,11 +1018,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len,
uint32_t seq)
{
opts->o_header.h_cmd = DLM_OPTS;
- opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- opts->o_header.h_nodeid = dlm_our_nodeid();
- opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len;
- opts->o_header.u.h_seq = seq;
- header_out(&opts->o_header);
+ opts->o_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ opts->o_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ opts->o_header.h_length = cpu_to_le16(DLM_MIDCOMMS_OPT_LEN + inner_len);
+ opts->o_header.u.h_seq = cpu_to_le32(seq);
}
static void midcomms_new_msg_cb(void *data)
@@ -1062,6 +1059,10 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node
return msg;
}
+/* avoid a false positive for nodes_srcu; the unlock happens in
+ * dlm_midcomms_commit_mhandle(), which must be called on success
+ */
+#ifndef __CHECKER__
struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
gfp_t allocation, char **ppc)
{
@@ -1127,6 +1128,7 @@ err:
srcu_read_unlock(&nodes_srcu, idx);
return NULL;
}
+#endif
static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
{
@@ -1136,6 +1138,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
dlm_lowcomms_commit_msg(mh->msg);
}
+/* avoid a false positive for nodes_srcu; the lock was taken in
+ * dlm_midcomms_get_mhandle()
+ */
+#ifndef __CHECKER__
void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
{
switch (mh->node->version) {
@@ -1157,6 +1163,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
break;
}
}
+#endif
int dlm_midcomms_start(void)
{
@@ -1406,6 +1413,8 @@ int dlm_midcomms_close(int nodeid)
if (nodeid == dlm_our_nodeid())
return 0;
+ dlm_stop_lockspaces_check();
+
idx = srcu_read_lock(&nodes_srcu);
/* Abort pending close/remove operation */
node = nodeid2node(nodeid, 0);
@@ -1455,7 +1464,7 @@ static void midcomms_new_rawmsg_cb(void *data)
switch (h->h_cmd) {
case DLM_OPTS:
if (!h->u.h_seq)
- h->u.h_seq = rd->node->seq_send++;
+ h->u.h_seq = cpu_to_le32(rd->node->seq_send++);
break;
default:
break;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index c38b2b8ffd1d..0993eebf2060 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -13,26 +13,26 @@
#include "dlm_internal.h"
#include "lockspace.h"
-static spinlock_t ops_lock;
-static struct list_head send_list;
-static struct list_head recv_list;
-static wait_queue_head_t send_wq;
-static wait_queue_head_t recv_wq;
+static DEFINE_SPINLOCK(ops_lock);
+static LIST_HEAD(send_list);
+static LIST_HEAD(recv_list);
+static DECLARE_WAIT_QUEUE_HEAD(send_wq);
+static DECLARE_WAIT_QUEUE_HEAD(recv_wq);
-struct plock_op {
- struct list_head list;
- int done;
- struct dlm_plock_info info;
-};
-
-struct plock_xop {
- struct plock_op xop;
- int (*callback)(struct file_lock *fl, int result);
+struct plock_async_data {
void *fl;
void *file;
struct file_lock flc;
+ int (*callback)(struct file_lock *fl, int result);
};
+struct plock_op {
+ struct list_head list;
+ int done;
+ struct dlm_plock_info info;
+ /* if set indicates async handling */
+ struct plock_async_data *data;
+};
static inline void set_version(struct dlm_plock_info *info)
{
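
Splitting plock_xop into plock_op plus an optional plock_async_data pointer replaces the old "cast the base struct to the bigger one" pattern: the synchronous path now allocates only plock_op, and asynchronous handling is signalled by op->data != NULL instead of an unchecked downcast. Allocation pairs naturally with the new dlm_release_plock_op() below, roughly (the async flag stands in for fl_lmops->lm_grant being set):

	static struct plock_op *plock_op_alloc(bool async)
	{
		struct plock_op *op = kzalloc(sizeof(*op), GFP_NOFS);

		if (!op)
			return NULL;
		if (async) {
			op->data = kzalloc(sizeof(*op->data), GFP_NOFS);
			if (!op->data) {
				dlm_release_plock_op(op); /* kfree(NULL) is safe */
				return NULL;
			}
		}
		return op;	/* op->data != NULL iff async handling */
	}
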
@@ -58,10 +58,15 @@ static int check_version(struct dlm_plock_info *info)
return 0;
}
+static void dlm_release_plock_op(struct plock_op *op)
+{
+ kfree(op->data);
+ kfree(op);
+}
+
static void send_op(struct plock_op *op)
{
set_version(&op->info);
- INIT_LIST_HEAD(&op->list);
spin_lock(&ops_lock);
list_add_tail(&op->list, &send_list);
spin_unlock(&ops_lock);
@@ -101,22 +106,21 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number,
int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
int cmd, struct file_lock *fl)
{
+ struct plock_async_data *op_data;
struct dlm_ls *ls;
struct plock_op *op;
- struct plock_xop *xop;
int rv;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
- xop = kzalloc(sizeof(*xop), GFP_NOFS);
- if (!xop) {
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op) {
rv = -ENOMEM;
goto out;
}
- op = &xop->xop;
op->info.optype = DLM_PLOCK_OP_LOCK;
op->info.pid = fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
@@ -125,46 +129,49 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
+ /* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data) {
+ dlm_release_plock_op(op);
+ rv = -ENOMEM;
+ goto out;
+ }
+
/* fl_owner is lockd which doesn't distinguish
processes on the nfs client */
op->info.owner = (__u64) fl->fl_pid;
- xop->callback = fl->fl_lmops->lm_grant;
- locks_init_lock(&xop->flc);
- locks_copy_lock(&xop->flc, fl);
- xop->fl = fl;
- xop->file = file;
+ op_data->callback = fl->fl_lmops->lm_grant;
+ locks_init_lock(&op_data->flc);
+ locks_copy_lock(&op_data->flc, fl);
+ op_data->fl = fl;
+ op_data->file = file;
+
+ op->data = op_data;
+
+ send_op(op);
+ rv = FILE_LOCK_DEFERRED;
+ goto out;
} else {
op->info.owner = (__u64)(long) fl->fl_owner;
- xop->callback = NULL;
}
send_op(op);
- if (xop->callback == NULL) {
- rv = wait_event_interruptible(recv_wq, (op->done != 0));
- if (rv == -ERESTARTSYS) {
- log_debug(ls, "dlm_posix_lock: wait killed %llx",
- (unsigned long long)number);
- spin_lock(&ops_lock);
- list_del(&op->list);
- spin_unlock(&ops_lock);
- kfree(xop);
- do_unlock_close(ls, number, file, fl);
- goto out;
- }
- } else {
- rv = FILE_LOCK_DEFERRED;
+ rv = wait_event_interruptible(recv_wq, (op->done != 0));
+ if (rv == -ERESTARTSYS) {
+ spin_lock(&ops_lock);
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ log_print("%s: wait interrupted %x %llx, op removed",
+ __func__, ls->ls_global_id,
+ (unsigned long long)number);
+ dlm_release_plock_op(op);
+ do_unlock_close(ls, number, file, fl);
goto out;
}
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_error(ls, "dlm_posix_lock: op on list %llx",
- (unsigned long long)number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+ WARN_ON(!list_empty(&op->list));
rv = op->info.rv;
@@ -174,7 +181,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
(unsigned long long)number);
}
- kfree(xop);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
return rv;
@@ -184,26 +191,20 @@ EXPORT_SYMBOL_GPL(dlm_posix_lock);
/* Returns failure iff a successful lock operation should be canceled */
static int dlm_plock_callback(struct plock_op *op)
{
+ struct plock_async_data *op_data = op->data;
struct file *file;
struct file_lock *fl;
struct file_lock *flc;
int (*notify)(struct file_lock *fl, int result) = NULL;
- struct plock_xop *xop = (struct plock_xop *)op;
int rv = 0;
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_print("dlm_plock_callback: op on list %llx",
- (unsigned long long)op->info.number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+ WARN_ON(!list_empty(&op->list));
/* check if the following 2 are still valid or make a copy */
- file = xop->file;
- flc = &xop->flc;
- fl = xop->fl;
- notify = xop->callback;
+ file = op_data->file;
+ flc = &op_data->flc;
+ fl = op_data->fl;
+ notify = op_data->callback;
if (op->info.rv) {
notify(fl, op->info.rv);
@@ -234,7 +235,7 @@ static int dlm_plock_callback(struct plock_op *op)
}
out:
- kfree(xop);
+ dlm_release_plock_op(op);
return rv;
}
@@ -290,13 +291,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
wait_event(recv_wq, (op->done != 0));
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_error(ls, "dlm_posix_unlock: op on list %llx",
- (unsigned long long)number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+ WARN_ON(!list_empty(&op->list));
rv = op->info.rv;
@@ -304,7 +299,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = 0;
out_free:
- kfree(op);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
fl->fl_flags = fl_flags;
@@ -344,13 +339,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
wait_event(recv_wq, (op->done != 0));
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_error(ls, "dlm_posix_get: op on list %llx",
- (unsigned long long)number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+ WARN_ON(!list_empty(&op->list));
/* info.rv from userspace is 1 for conflict, 0 for no-conflict,
-ENOENT if there are no locks on the file */
@@ -370,7 +359,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = 0;
}
- kfree(op);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
return rv;
@@ -406,7 +395,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
(the process did not make an unlock call). */
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
- kfree(op);
+ dlm_release_plock_op(op);
if (copy_to_user(u, &info, sizeof(info)))
return -EFAULT;
@@ -418,9 +407,9 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
loff_t *ppos)
{
+ struct plock_op *op = NULL, *iter;
struct dlm_plock_info info;
- struct plock_op *op;
- int found = 0, do_callback = 0;
+ int do_callback = 0;
if (count != sizeof(info))
return -EINVAL;
@@ -432,31 +421,30 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
return -EINVAL;
spin_lock(&ops_lock);
- list_for_each_entry(op, &recv_list, list) {
- if (op->info.fsid == info.fsid &&
- op->info.number == info.number &&
- op->info.owner == info.owner) {
- struct plock_xop *xop = (struct plock_xop *)op;
- list_del_init(&op->list);
- memcpy(&op->info, &info, sizeof(info));
- if (xop->callback)
+ list_for_each_entry(iter, &recv_list, list) {
+ if (iter->info.fsid == info.fsid &&
+ iter->info.number == info.number &&
+ iter->info.owner == info.owner) {
+ list_del_init(&iter->list);
+ memcpy(&iter->info, &info, sizeof(info));
+ if (iter->data)
do_callback = 1;
else
- op->done = 1;
- found = 1;
+ iter->done = 1;
+ op = iter;
break;
}
}
spin_unlock(&ops_lock);
- if (found) {
+ if (op) {
if (do_callback)
dlm_plock_callback(op);
else
wake_up(&recv_wq);
} else
- log_print("dev_write no op %x %llx", info.fsid,
- (unsigned long long)info.number);
+ log_print("%s: no op %x %llx - may got interrupted?", __func__,
+ info.fsid, (unsigned long long)info.number);
return count;
}
@@ -492,12 +480,6 @@ int dlm_plock_init(void)
{
int rv;
- spin_lock_init(&ops_lock);
- INIT_LIST_HEAD(&send_list);
- INIT_LIST_HEAD(&recv_list);
- init_waitqueue_head(&send_wq);
- init_waitqueue_head(&recv_wq);
-
rv = misc_register(&plock_dev_misc);
if (rv)
log_print("dlm_plock_init: misc_register failed %d", rv);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 5821b777a1a7..f19860315043 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -34,16 +34,16 @@ static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
rc = (struct dlm_rcom *) mb;
- rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- rc->rc_header.u.h_lockspace = ls->ls_global_id;
- rc->rc_header.h_nodeid = dlm_our_nodeid();
- rc->rc_header.h_length = mb_len;
+ rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
+ rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ rc->rc_header.h_length = cpu_to_le16(mb_len);
rc->rc_header.h_cmd = DLM_RCOM;
- rc->rc_type = type;
+ rc->rc_type = cpu_to_le32(type);
spin_lock(&ls->ls_recover_lock);
- rc->rc_seq = ls->ls_recover_seq;
+ rc->rc_seq = cpu_to_le64(ls->ls_recover_seq);
spin_unlock(&ls->ls_recover_lock);
*rc_ret = rc;
@@ -91,13 +91,11 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
{
- dlm_rcom_out(rc);
dlm_midcomms_commit_mhandle(mh);
}
static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
{
- dlm_rcom_out(rc);
dlm_lowcomms_commit_msg(msg);
dlm_lowcomms_put_msg(msg);
}
@@ -127,10 +125,10 @@ static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
{
struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
- if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
+ if ((le32_to_cpu(rc->rc_header.h_version) & 0xFFFF0000) != DLM_HEADER_MAJOR) {
log_error(ls, "version mismatch: %x nodeid %d: %x",
DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
- rc->rc_header.h_version);
+ le32_to_cpu(rc->rc_header.h_version));
return -EPROTO;
}
@@ -145,10 +143,10 @@ static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
return 0;
}
-static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq)
+static void allow_sync_reply(struct dlm_ls *ls, __le64 *new_seq)
{
spin_lock(&ls->ls_rcom_spin);
- *new_seq = ++ls->ls_rcom_seq;
+ *new_seq = cpu_to_le64(++ls->ls_rcom_seq);
set_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
spin_unlock(&ls->ls_rcom_spin);
}
@@ -182,7 +180,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
if (nodeid == dlm_our_nodeid()) {
rc = ls->ls_recover_buf;
- rc->rc_result = dlm_recover_status(ls);
+ rc->rc_result = cpu_to_le32(dlm_recover_status(ls));
goto out;
}
@@ -208,7 +206,7 @@ retry:
rc = ls->ls_recover_buf;
- if (rc->rc_result == -ESRCH) {
+ if (rc->rc_result == cpu_to_le32(-ESRCH)) {
/* we pretend the remote lockspace exists with 0 status */
log_debug(ls, "remote node %d not ready", nodeid);
rc->rc_result = 0;
@@ -227,7 +225,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
struct dlm_rcom *rc;
struct rcom_status *rs;
uint32_t status;
- int nodeid = rc_in->rc_header.h_nodeid;
+ int nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
int len = sizeof(struct rcom_config);
struct dlm_msg *msg;
int num_slots = 0;
@@ -259,7 +257,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- rc->rc_result = status;
+ rc->rc_result = cpu_to_le32(status);
set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
@@ -287,14 +285,16 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
spin_lock(&ls->ls_rcom_spin);
if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) ||
- rc_in->rc_id != ls->ls_rcom_seq) {
+ le64_to_cpu(rc_in->rc_id) != ls->ls_rcom_seq) {
log_debug(ls, "reject reply %d from %d seq %llx expect %llx",
- rc_in->rc_type, rc_in->rc_header.h_nodeid,
- (unsigned long long)rc_in->rc_id,
+ le32_to_cpu(rc_in->rc_type),
+ le32_to_cpu(rc_in->rc_header.h_nodeid),
+ (unsigned long long)le64_to_cpu(rc_in->rc_id),
(unsigned long long)ls->ls_rcom_seq);
goto out;
}
- memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+ memcpy(ls->ls_recover_buf, rc_in,
+ le16_to_cpu(rc_in->rc_header.h_length));
set_bit(LSFL_RCOM_READY, &ls->ls_flags);
clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
wake_up(&ls->ls_wait_general);
@@ -336,8 +336,9 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
int error, inlen, outlen, nodeid;
struct dlm_msg *msg;
- nodeid = rc_in->rc_header.h_nodeid;
- inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+ nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+ inlen = le16_to_cpu(rc_in->rc_header.h_length) -
+ sizeof(struct dlm_rcom);
outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom);
error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen,
@@ -364,7 +365,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
if (error)
goto out;
memcpy(rc->rc_buf, r->res_name, r->res_length);
- rc->rc_id = (unsigned long) r->res_id;
+ rc->rc_id = cpu_to_le64(r->res_id);
send_rcom(mh, rc);
out:
@@ -375,11 +376,12 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
- int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
- int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+ int error, ret_nodeid, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+ int len = le16_to_cpu(rc_in->rc_header.h_length) -
+ sizeof(struct dlm_rcom);
/* Old code would send this special id to trigger a debug dump. */
- if (rc_in->rc_id == 0xFFFFFFFF) {
+ if (rc_in->rc_id == cpu_to_le64(0xFFFFFFFF)) {
log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
dlm_dump_rsb_name(ls, rc_in->rc_buf, len);
return;
@@ -393,7 +395,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL);
if (error)
ret_nodeid = error;
- rc->rc_result = ret_nodeid;
+ rc->rc_result = cpu_to_le32(ret_nodeid);
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
@@ -452,7 +454,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
rl = (struct rcom_lock *) rc->rc_buf;
pack_rcom_lock(r, lkb, rl);
- rc->rc_id = (unsigned long) r;
+ rc->rc_id = cpu_to_le64((uintptr_t)r);
send_rcom(mh, rc);
out:
@@ -464,7 +466,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
- int error, nodeid = rc_in->rc_header.h_nodeid;
+ int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
dlm_recover_master_copy(ls, rc_in);
@@ -500,21 +502,20 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
rc = (struct dlm_rcom *) mb;
- rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace;
- rc->rc_header.h_nodeid = dlm_our_nodeid();
- rc->rc_header.h_length = mb_len;
+ rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ rc->rc_header.h_length = cpu_to_le16(mb_len);
rc->rc_header.h_cmd = DLM_RCOM;
- rc->rc_type = DLM_RCOM_STATUS_REPLY;
+ rc->rc_type = cpu_to_le32(DLM_RCOM_STATUS_REPLY);
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- rc->rc_result = -ESRCH;
+ rc->rc_result = cpu_to_le32(-ESRCH);
rf = (struct rcom_config *) rc->rc_buf;
rf->rf_lvblen = cpu_to_le32(~0U);
- dlm_rcom_out(rc);
dlm_midcomms_commit_mhandle(mh);
return 0;
@@ -573,27 +574,27 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
uint64_t seq;
switch (rc->rc_type) {
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
reply = 1;
break;
- case DLM_RCOM_NAMES:
+ case cpu_to_le32(DLM_RCOM_NAMES):
names = 1;
break;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
names = 1;
reply = 1;
break;
- case DLM_RCOM_LOOKUP:
+ case cpu_to_le32(DLM_RCOM_LOOKUP):
lookup = 1;
break;
- case DLM_RCOM_LOOKUP_REPLY:
+ case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY):
lookup = 1;
reply = 1;
break;
- case DLM_RCOM_LOCK:
+ case cpu_to_le32(DLM_RCOM_LOCK):
lock = 1;
break;
- case DLM_RCOM_LOCK_REPLY:
+ case cpu_to_le32(DLM_RCOM_LOCK_REPLY):
lock = 1;
reply = 1;
break;
@@ -605,10 +606,10 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
seq = ls->ls_recover_seq;
spin_unlock(&ls->ls_recover_lock);
- if (stop && (rc->rc_type != DLM_RCOM_STATUS))
+ if (stop && (rc->rc_type != cpu_to_le32(DLM_RCOM_STATUS)))
goto ignore;
- if (reply && (rc->rc_seq_reply != seq))
+ if (reply && (le64_to_cpu(rc->rc_seq_reply) != seq))
goto ignore;
if (!(status & DLM_RS_NODES) && (names || lookup || lock))
@@ -618,59 +619,60 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
goto ignore;
switch (rc->rc_type) {
- case DLM_RCOM_STATUS:
+ case cpu_to_le32(DLM_RCOM_STATUS):
receive_rcom_status(ls, rc);
break;
- case DLM_RCOM_NAMES:
+ case cpu_to_le32(DLM_RCOM_NAMES):
receive_rcom_names(ls, rc);
break;
- case DLM_RCOM_LOOKUP:
+ case cpu_to_le32(DLM_RCOM_LOOKUP):
receive_rcom_lookup(ls, rc);
break;
- case DLM_RCOM_LOCK:
- if (rc->rc_header.h_length < lock_size)
+ case cpu_to_le32(DLM_RCOM_LOCK):
+ if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
goto Eshort;
receive_rcom_lock(ls, rc);
break;
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
receive_sync_reply(ls, rc);
break;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
receive_sync_reply(ls, rc);
break;
- case DLM_RCOM_LOOKUP_REPLY:
+ case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY):
receive_rcom_lookup_reply(ls, rc);
break;
- case DLM_RCOM_LOCK_REPLY:
- if (rc->rc_header.h_length < lock_size)
+ case cpu_to_le32(DLM_RCOM_LOCK_REPLY):
+ if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
goto Eshort;
dlm_recover_process_copy(ls, rc);
break;
default:
- log_error(ls, "receive_rcom bad type %d", rc->rc_type);
+ log_error(ls, "receive_rcom bad type %d",
+ le32_to_cpu(rc->rc_type));
}
return;
ignore:
log_limit(ls, "dlm_receive_rcom ignore msg %d "
"from %d %llu %llu recover seq %llu sts %x gen %u",
- rc->rc_type,
+ le32_to_cpu(rc->rc_type),
nodeid,
- (unsigned long long)rc->rc_seq,
- (unsigned long long)rc->rc_seq_reply,
+ (unsigned long long)le64_to_cpu(rc->rc_seq),
+ (unsigned long long)le64_to_cpu(rc->rc_seq_reply),
(unsigned long long)seq,
status, ls->ls_generation);
return;
Eshort:
log_error(ls, "recovery message %d from %d is too short",
- rc->rc_type, nodeid);
+ le32_to_cpu(rc->rc_type), nodeid);
}
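The rcom changes keep every on-wire field in little-endian form (__le16/__le32/__le64) and, in the switch statements above, compare the stored value against constants converted once with cpu_to_le32() instead of byte-swapping each received message. A small userspace analogue, assuming glibc's <endian.h> htole32()/le32toh() in place of the kernel helpers, shows that the two comparison styles agree:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define DLM_RCOM_STATUS 1	/* illustrative stand-in for the kernel constant */

int main(void)
{
	/* Value as it would appear on the wire (always little-endian). */
	uint32_t wire_type = htole32(DLM_RCOM_STATUS);

	/* Style 1: convert the wire value to host order, compare in host order. */
	int match_host = (le32toh(wire_type) == DLM_RCOM_STATUS);

	/* Style 2: convert the constant once, compare in wire order
	 * (what the switch (rc->rc_type) cases above do). */
	int match_wire = (wire_type == htole32(DLM_RCOM_STATUS));

	printf("host-order compare: %d, wire-order compare: %d\n",
	       match_host, match_wire);
	return 0;
}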
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 8928e99dfd47..ccff1791803f 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -114,7 +114,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
if (save_slots)
dlm_slot_save(ls, rc, memb);
- if (rc->rc_result & wait_status)
+ if (le32_to_cpu(rc->rc_result) & wait_status)
break;
if (delay < 1000)
delay += 20;
@@ -141,7 +141,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
if (error)
break;
- if (rc->rc_result & wait_status)
+ if (le32_to_cpu(rc->rc_result) & wait_status)
break;
if (delay < 1000)
delay += 20;
@@ -568,14 +568,14 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
struct dlm_rsb *r;
int ret_nodeid, new_master;
- r = recover_idr_find(ls, rc->rc_id);
+ r = recover_idr_find(ls, le64_to_cpu(rc->rc_id));
if (!r) {
log_error(ls, "dlm_recover_master_reply no id %llx",
- (unsigned long long)rc->rc_id);
+ (unsigned long long)le64_to_cpu(rc->rc_id));
goto out;
}
- ret_nodeid = rc->rc_result;
+ ret_nodeid = le32_to_cpu(rc->rc_result);
if (ret_nodeid == dlm_our_nodeid())
new_master = 0;
@@ -732,10 +732,9 @@ void dlm_recovered_lock(struct dlm_rsb *r)
static void recover_lvb(struct dlm_rsb *r)
{
- struct dlm_lkb *lkb, *high_lkb = NULL;
+ struct dlm_lkb *big_lkb = NULL, *iter, *high_lkb = NULL;
uint32_t high_seq = 0;
int lock_lvb_exists = 0;
- int big_lock_exists = 0;
int lvblen = r->res_ls->ls_lvblen;
if (!rsb_flag(r, RSB_NEW_MASTER2) &&
@@ -751,37 +750,37 @@ static void recover_lvb(struct dlm_rsb *r)
/* we are the new master, so figure out if VALNOTVALID should
be set, and set the rsb lvb from the best lkb available. */
- list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
- if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ list_for_each_entry(iter, &r->res_grantqueue, lkb_statequeue) {
+ if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = 1;
- if (lkb->lkb_grmode > DLM_LOCK_CR) {
- big_lock_exists = 1;
+ if (iter->lkb_grmode > DLM_LOCK_CR) {
+ big_lkb = iter;
goto setflag;
}
- if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
- high_lkb = lkb;
- high_seq = lkb->lkb_lvbseq;
+ if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = iter;
+ high_seq = iter->lkb_lvbseq;
}
}
- list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
- if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ list_for_each_entry(iter, &r->res_convertqueue, lkb_statequeue) {
+ if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = 1;
- if (lkb->lkb_grmode > DLM_LOCK_CR) {
- big_lock_exists = 1;
+ if (iter->lkb_grmode > DLM_LOCK_CR) {
+ big_lkb = iter;
goto setflag;
}
- if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
- high_lkb = lkb;
- high_seq = lkb->lkb_lvbseq;
+ if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = iter;
+ high_seq = iter->lkb_lvbseq;
}
}
@@ -790,7 +789,7 @@ static void recover_lvb(struct dlm_rsb *r)
goto out;
/* lvb is invalidated if only NL/CR locks remain */
- if (!big_lock_exists)
+ if (!big_lkb)
rsb_set_flag(r, RSB_VALNOTVALID);
if (!r->res_lvbptr) {
@@ -799,9 +798,9 @@ static void recover_lvb(struct dlm_rsb *r)
goto out;
}
- if (big_lock_exists) {
- r->res_lvbseq = lkb->lkb_lvbseq;
- memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+ if (big_lkb) {
+ r->res_lvbseq = big_lkb->lkb_lvbseq;
+ memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
} else if (high_lkb) {
r->res_lvbseq = high_lkb->lkb_lvbseq;
memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
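In recover_lvb() the big_lock_exists flag becomes a big_lkb pointer captured while walking the queues, because after list_for_each_entry() terminates the loop cursor no longer points at the matching entry. The same capture-the-entry pattern in a self-contained sketch (a plain array stands in for the kernel lists):

#include <stdio.h>
#include <stddef.h>

struct lkb {
	int grmode;
	unsigned int lvbseq;
};

int main(void)
{
	struct lkb queue[] = { { 0, 3 }, { 5, 9 }, { 1, 4 } };
	struct lkb *big_lkb = NULL, *high_lkb = NULL;
	unsigned int high_seq = 0;
	size_t i;

	for (i = 0; i < sizeof(queue) / sizeof(queue[0]); i++) {
		if (queue[i].grmode > 1) {	/* > DLM_LOCK_CR analogue */
			big_lkb = &queue[i];	/* remember the entry itself */
			break;
		}
		if ((int)(queue[i].lvbseq - high_seq) >= 0) {
			high_lkb = &queue[i];
			high_seq = queue[i].lvbseq;
		}
	}

	/* Safe: use the saved pointer, never the (stale) loop cursor. */
	if (big_lkb)
		printf("big lkb lvbseq %u\n", big_lkb->lvbseq);
	else if (high_lkb)
		printf("high lkb lvbseq %u\n", high_lkb->lvbseq);
	return 0;
}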
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index ccb5307c21e9..036a9a0078f6 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -14,6 +14,7 @@
#include "dir.h"
#include "config.h"
#include "requestqueue.h"
+#include "util.h"
struct rq_entry {
struct list_head list;
@@ -32,7 +33,8 @@ struct rq_entry {
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
{
struct rq_entry *e;
- int length = ms->m_header.h_length - sizeof(struct dlm_message);
+ int length = le16_to_cpu(ms->m_header.h_length) -
+ sizeof(struct dlm_message);
e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
if (!e) {
@@ -42,7 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
e->nodeid = nodeid;
- memcpy(&e->request, ms, ms->m_header.h_length);
+ memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length));
atomic_inc(&ls->ls_requestqueue_cnt);
mutex_lock(&ls->ls_requestqueue_mutex);
@@ -82,8 +84,10 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
log_limit(ls, "dlm_process_requestqueue msg %d from %d "
"lkid %x remid %x result %d seq %u",
- ms->m_type, ms->m_header.h_nodeid,
- ms->m_lkid, ms->m_remid, ms->m_result,
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)),
e->recover_seq);
dlm_receive_message_saved(ls, &e->request, e->recover_seq);
@@ -124,7 +128,7 @@ void dlm_wait_requestqueue(struct dlm_ls *ls)
static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
{
- uint32_t type = ms->m_type;
+ __le32 type = ms->m_type;
/* the ls is being cleaned up and freed by release_lockspace */
if (!atomic_read(&ls->ls_count))
@@ -136,9 +140,9 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
/* directory operations are always purged because the directory is
always rebuilt during recovery and the lookups resent */
- if (type == DLM_MSG_REMOVE ||
- type == DLM_MSG_LOOKUP ||
- type == DLM_MSG_LOOKUP_REPLY)
+ if (type == cpu_to_le32(DLM_MSG_REMOVE) ||
+ type == cpu_to_le32(DLM_MSG_LOOKUP) ||
+ type == cpu_to_le32(DLM_MSG_LOOKUP_REPLY))
return 1;
if (!dlm_no_directory(ls))
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index e5cefa90b1ce..1060b24f18d4 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -108,11 +108,11 @@ static void compat_input(struct dlm_write_request *kb,
kb->i.lock.parent = kb32->i.lock.parent;
kb->i.lock.xid = kb32->i.lock.xid;
kb->i.lock.timeout = kb32->i.lock.timeout;
- kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
- kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
- kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
- kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
- kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
+	kb->i.lock.castparam = (void __user *)(long)kb32->i.lock.castparam;
+	kb->i.lock.castaddr = (void __user *)(long)kb32->i.lock.castaddr;
+	kb->i.lock.bastparam = (void __user *)(long)kb32->i.lock.bastparam;
+	kb->i.lock.bastaddr = (void __user *)(long)kb32->i.lock.bastaddr;
+	kb->i.lock.lksb = (void __user *)(long)kb32->i.lock.lksb;
memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
memcpy(kb->i.lock.name, kb32->i.lock.name, namelen);
}
@@ -127,9 +127,9 @@ static void compat_output(struct dlm_lock_result *res,
res32->version[1] = res->version[1];
res32->version[2] = res->version[2];
- res32->user_astaddr = (__u32)(long)res->user_astaddr;
- res32->user_astparam = (__u32)(long)res->user_astparam;
- res32->user_lksb = (__u32)(long)res->user_lksb;
+ res32->user_astaddr = (__u32)(__force long)res->user_astaddr;
+ res32->user_astparam = (__u32)(__force long)res->user_astparam;
+ res32->user_lksb = (__u32)(__force long)res->user_lksb;
res32->bast_mode = res->bast_mode;
res32->lvb_offset = res->lvb_offset;
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 58acbcc2081a..f2bc401f312f 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -20,28 +20,10 @@
#define DLM_ERRNO_ETIMEDOUT 110
#define DLM_ERRNO_EINPROGRESS 115
-void header_out(struct dlm_header *hd)
-{
- hd->h_version = cpu_to_le32(hd->h_version);
- /* does it for others u32 in union as well */
- hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace);
- hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
- hd->h_length = cpu_to_le16(hd->h_length);
-}
-
-void header_in(struct dlm_header *hd)
-{
- hd->h_version = le32_to_cpu(hd->h_version);
- /* does it for others u32 in union as well */
- hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace);
- hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
- hd->h_length = le16_to_cpu(hd->h_length);
-}
-
/* higher errno values are inconsistent across architectures, so select
one set of values for on the wire */
-static int to_dlm_errno(int err)
+int to_dlm_errno(int err)
{
switch (err) {
case -EDEADLK:
@@ -62,7 +44,7 @@ static int to_dlm_errno(int err)
return err;
}
-static int from_dlm_errno(int err)
+int from_dlm_errno(int err)
{
switch (err) {
case -DLM_ERRNO_EDEADLK:
@@ -82,73 +64,3 @@ static int from_dlm_errno(int err)
}
return err;
}
-
-void dlm_message_out(struct dlm_message *ms)
-{
- header_out(&ms->m_header);
-
- ms->m_type = cpu_to_le32(ms->m_type);
- ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
- ms->m_pid = cpu_to_le32(ms->m_pid);
- ms->m_lkid = cpu_to_le32(ms->m_lkid);
- ms->m_remid = cpu_to_le32(ms->m_remid);
- ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
- ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
- ms->m_exflags = cpu_to_le32(ms->m_exflags);
- ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
- ms->m_flags = cpu_to_le32(ms->m_flags);
- ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
- ms->m_hash = cpu_to_le32(ms->m_hash);
- ms->m_status = cpu_to_le32(ms->m_status);
- ms->m_grmode = cpu_to_le32(ms->m_grmode);
- ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
- ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
- ms->m_asts = cpu_to_le32(ms->m_asts);
- ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result));
-}
-
-void dlm_message_in(struct dlm_message *ms)
-{
- header_in(&ms->m_header);
-
- ms->m_type = le32_to_cpu(ms->m_type);
- ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
- ms->m_pid = le32_to_cpu(ms->m_pid);
- ms->m_lkid = le32_to_cpu(ms->m_lkid);
- ms->m_remid = le32_to_cpu(ms->m_remid);
- ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
- ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
- ms->m_exflags = le32_to_cpu(ms->m_exflags);
- ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
- ms->m_flags = le32_to_cpu(ms->m_flags);
- ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
- ms->m_hash = le32_to_cpu(ms->m_hash);
- ms->m_status = le32_to_cpu(ms->m_status);
- ms->m_grmode = le32_to_cpu(ms->m_grmode);
- ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
- ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
- ms->m_asts = le32_to_cpu(ms->m_asts);
- ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result));
-}
-
-void dlm_rcom_out(struct dlm_rcom *rc)
-{
- header_out(&rc->rc_header);
-
- rc->rc_type = cpu_to_le32(rc->rc_type);
- rc->rc_result = cpu_to_le32(rc->rc_result);
- rc->rc_id = cpu_to_le64(rc->rc_id);
- rc->rc_seq = cpu_to_le64(rc->rc_seq);
- rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply);
-}
-
-void dlm_rcom_in(struct dlm_rcom *rc)
-{
- header_in(&rc->rc_header);
-
- rc->rc_type = le32_to_cpu(rc->rc_type);
- rc->rc_result = le32_to_cpu(rc->rc_result);
- rc->rc_id = le64_to_cpu(rc->rc_id);
- rc->rc_seq = le64_to_cpu(rc->rc_seq);
- rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply);
-}
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
index d46f23c7a6a0..b6a4b8adca8d 100644
--- a/fs/dlm/util.h
+++ b/fs/dlm/util.h
@@ -11,12 +11,8 @@
#ifndef __UTIL_DOT_H__
#define __UTIL_DOT_H__
-void dlm_message_out(struct dlm_message *ms);
-void dlm_message_in(struct dlm_message *ms);
-void dlm_rcom_out(struct dlm_rcom *rc);
-void dlm_rcom_in(struct dlm_rcom *rc);
-void header_out(struct dlm_header *hd);
-void header_in(struct dlm_header *hd);
+int to_dlm_errno(int err);
+int from_dlm_errno(int err);
#endif
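With dlm_message_out()/dlm_message_in() and friends removed, to_dlm_errno() and from_dlm_errno() are exported from util.c so the errno translation happens at the point where a result is packed or unpacked. Higher errno values differ across architectures, hence the fixed on-wire codes. A condensed userspace round-trip sketch using only the EDEADLK mapping (on-wire value assumed here to be 35, per the kernel's table; the two constants visible above are ETIMEDOUT 110 and EINPROGRESS 115):

#include <errno.h>
#include <stdio.h>

#define DLM_ERRNO_EDEADLK 35	/* fixed on-wire value */

static int to_dlm_errno(int err)
{
	if (err == -EDEADLK)
		return -DLM_ERRNO_EDEADLK;
	return err;		/* pass through anything unmapped */
}

static int from_dlm_errno(int err)
{
	if (err == -DLM_ERRNO_EDEADLK)
		return -EDEADLK;
	return err;
}

int main(void)
{
	int wire = to_dlm_errno(-EDEADLK);

	/* Round-trips to the local value whatever EDEADLK is on this arch. */
	printf("wire %d -> local %d (EDEADLK is %d here)\n",
	       wire, from_dlm_errno(wire), EDEADLK);
	return 0;
}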
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 9ad61b582f07..19af229eb7ca 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -170,16 +170,17 @@ out:
}
/**
- * ecryptfs_readpage
+ * ecryptfs_read_folio
* @file: An eCryptfs file
- * @page: Page from eCryptfs inode mapping into which to stick the read data
+ * @folio: Folio from eCryptfs inode mapping into which to stick the read data
*
- * Read in a page, decrypting if necessary.
+ * Read in a folio, decrypting if necessary.
*
* Returns zero on success; non-zero on error.
*/
-static int ecryptfs_readpage(struct file *file, struct page *page)
+static int ecryptfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
int rc = 0;
@@ -264,7 +265,7 @@ out:
*/
static int ecryptfs_write_begin(struct file *file,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
@@ -272,7 +273,7 @@ static int ecryptfs_write_begin(struct file *file,
loff_t prev_page_end_size;
int rc = 0;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -549,7 +550,7 @@ const struct address_space_operations ecryptfs_aops = {
.invalidate_folio = block_invalidate_folio,
#endif
.writepage = ecryptfs_writepage,
- .readpage = ecryptfs_readpage,
+ .read_folio = ecryptfs_read_folio,
.write_begin = ecryptfs_write_begin,
.write_end = ecryptfs_write_end,
.bmap = ecryptfs_bmap,
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 89e73a6f0d36..3ba94bb005a6 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -14,16 +14,18 @@
#include "efs.h"
#include <linux/efs_fs_sb.h>
-static int efs_readpage(struct file *file, struct page *page)
+static int efs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,efs_get_block);
+ return block_read_full_folio(folio, efs_get_block);
}
+
static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,efs_get_block);
}
+
static const struct address_space_operations efs_aops = {
- .readpage = efs_readpage,
+ .read_folio = efs_read_folio,
.bmap = _efs_bmap
};
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 923eb91654d5..3b03a573cb1a 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -12,8 +12,9 @@
#include <linux/buffer_head.h>
#include "efs.h"
-static int efs_symlink_readpage(struct file *file, struct page *page)
+static int efs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
char *link = page_address(page);
struct buffer_head * bh;
struct inode * inode = page->mapping->host;
@@ -49,5 +50,5 @@ fail:
}
const struct address_space_operations efs_symlink_aops = {
- .readpage = efs_symlink_readpage
+ .read_folio = efs_symlink_read_folio
};
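The ecryptfs and efs hunks above are mechanical ->readpage to ->read_folio conversions: the new hook receives a struct folio, and filesystems that are not yet folio-native immediately take &folio->page and proceed as before. A toy sketch of that conversion shape (the types below are simplified stand-ins for the kernel's, illustrative only):

#include <stdio.h>

/* Toy stand-ins for the kernel types. */
struct page { int uptodate; };
struct folio { struct page page; };

static int block_read_full_folio(struct folio *folio)
{
	folio->page.uptodate = 1;	/* pretend the block was read */
	return 0;
}

/* The old hook took a struct page; the new hook takes a folio, and a
 * not-yet-converted filesystem simply operates on folio->page. */
static int efs_read_folio(struct folio *folio)
{
	return block_read_full_folio(folio);
}

int main(void)
{
	struct folio f = { { 0 } };

	printf("read_folio rc=%d uptodate=%d\n",
	       efs_read_folio(&f), f.page.uptodate);
	return 0;
}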
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f57255ab88ed..85490370e0ca 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -98,3 +98,13 @@ config EROFS_FS_ZIP_LZMA
systems will be readable without selecting this option.
If unsure, say N.
+
+config EROFS_FS_ONDEMAND
+ bool "EROFS fscache-based on-demand read support"
+ depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
+ default n
+ help
+ This permits EROFS to use fscache-backed data blobs with on-demand
+ read support.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 8a3317e38e5a..99bbc597a3e9 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,3 +5,4 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 780db1e5f4b7..fbb037ba326e 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -6,6 +6,7 @@
*/
#include "internal.h"
#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
#include <linux/dax.h>
#include <trace/events/erofs.h>
@@ -35,14 +36,20 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
erofs_off_t offset = blknr_to_addr(blkaddr);
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
+ struct folio *folio;
+ unsigned int nofs_flag;
if (!page || page->index != index) {
erofs_put_metabuf(buf);
- page = read_cache_page_gfp(mapping, index,
- mapping_gfp_constraint(mapping, ~__GFP_FS));
- if (IS_ERR(page))
- return page;
+
+ nofs_flag = memalloc_nofs_save();
+ folio = read_cache_folio(mapping, index, NULL, NULL);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(folio))
+ return folio;
+
/* should already be PageUptodate, no need to lock page */
+ page = folio_file_page(folio, index);
buf->page = page;
}
if (buf->kmap_type == EROFS_NO_KMAP) {
@@ -63,6 +70,10 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
+ if (erofs_is_fscache_mode(sb))
+ return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode,
+ blkaddr, type);
+
return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
}
@@ -110,8 +121,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
return 0;
}
-static int erofs_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map, int flags)
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
{
struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
@@ -199,6 +210,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = sb->s_bdev;
map->m_daxdev = EROFS_SB(sb)->dax_dev;
map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
+ map->m_fscache = EROFS_SB(sb)->s_fscache;
if (map->m_deviceid) {
down_read(&devs->rwsem);
@@ -210,6 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
up_read(&devs->rwsem);
} else if (devs->extra_devices) {
down_read(&devs->rwsem);
@@ -227,6 +240,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
break;
}
}
@@ -337,9 +351,9 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* since we dont have write or truncate flows, so no inode
* locking needs to be held at the moment.
*/
-static int erofs_readpage(struct file *file, struct page *page)
+static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_readpage(page, &erofs_iomap_ops);
+ return iomap_read_folio(folio, &erofs_iomap_ops);
}
static void erofs_readahead(struct readahead_control *rac)
@@ -385,7 +399,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!err)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
- NULL, 0, 0);
+ NULL, 0, NULL, 0);
if (err < 0)
return err;
}
@@ -394,7 +408,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
/* for uncompressed (aligned) files and raw access for other files */
const struct address_space_operations erofs_raw_access_aops = {
- .readpage = erofs_readpage,
+ .read_folio = erofs_read_folio,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 3efa686c7644..6dca1900c733 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -46,8 +46,6 @@ int z_erofs_load_lz4_config(struct super_block *sb,
erofs_err(sb, "too large lz4 pclusterblks %u",
sbi->lz4.max_pclusterblks);
return -EINVAL;
- } else if (sbi->lz4.max_pclusterblks >= 2) {
- erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!");
}
} else {
distance = le16_to_cpu(dsb->u1.lz4_max_distance);
@@ -322,6 +320,7 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
PAGE_SIZE - rq->pageofs_out);
+ const unsigned int lefthalf = rq->outputsize - righthalf;
unsigned char *src, *dst;
if (nrpages_out > 2) {
@@ -344,10 +343,10 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
if (nrpages_out == 2) {
DBG_BUGON(!rq->out[1]);
if (rq->out[1] == *rq->in) {
- memmove(src, src + righthalf, rq->pageofs_out);
+ memmove(src, src + righthalf, lefthalf);
} else {
dst = kmap_atomic(rq->out[1]);
- memcpy(dst, src + righthalf, rq->pageofs_out);
+ memcpy(dst, src + righthalf, lefthalf);
kunmap_atomic(dst);
}
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 1238ca104f09..2b48373f690b 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -37,12 +37,9 @@
#define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot {
- union {
- u8 uuid[16]; /* used for device manager later */
- u8 userdata[64]; /* digest(sha256), etc. */
- } u;
- __le32 blocks; /* total fs blocks of this device */
- __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 tag[64]; /* digest(sha256), etc. */
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
u8 reserved[56];
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
@@ -58,8 +55,8 @@ struct erofs_super_block {
__le16 root_nid; /* nid of root directory */
__le64 inos; /* total valid ino # (== f_files - f_favail) */
- __le64 build_time; /* inode v1 time derivation */
- __le32 build_time_nsec; /* inode v1 time derivation in nano scale */
+ __le64 build_time; /* compact inode time derivation */
+ __le32 build_time_nsec; /* compact inode time derivation in ns scale */
__le32 blocks; /* used for statfs */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
@@ -79,15 +76,15 @@ struct erofs_super_block {
/*
* erofs inode datalayout (i_format in on-disk inode):
- * 0 - inode plain without inline data A:
+ * 0 - uncompressed flat inode without tail-packing inline data:
* inode, [xattrs], ... | ... | no-holed data
- * 1 - inode VLE compression B (legacy):
- * inode, [xattrs], extents ... | ...
- * 2 - inode plain with inline data C:
- * inode, [xattrs], last_inline_data, ... | ... | no-holed data
- * 3 - inode compression D:
+ * 1 - compressed inode with non-compact indexes:
+ * inode, [xattrs], [map_header], extents ... | ...
+ * 2 - uncompressed flat inode with tail-packing inline data:
+ * inode, [xattrs], tailpacking data, ... | ... | no-holed data
+ * 3 - compressed inode with compact indexes:
* inode, [xattrs], map_header, extents ... | ...
- * 4 - inode chunk-based E:
+ * 4 - chunk-based inode with (optional) multi-device support:
* inode, [xattrs], chunk indexes ... | ...
* 5~7 - reserved
*/
@@ -106,7 +103,7 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
}
-/* bit definitions of inode i_advise */
+/* bit definitions of inode i_format */
#define EROFS_I_VERSION_BITS 1
#define EROFS_I_DATALAYOUT_BITS 3
@@ -140,8 +137,9 @@ struct erofs_inode_compact {
__le32 i_size;
__le32 i_reserved;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
@@ -156,9 +154,9 @@ struct erofs_inode_compact {
__le32 i_reserved2;
};
-/* 32 bytes on-disk inode */
+/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0
-/* 64 bytes on-disk inode */
+/* 64-byte on-disk inode */
#define EROFS_INODE_LAYOUT_EXTENDED 1
/* 64-byte complete form of an ondisk inode */
@@ -171,8 +169,9 @@ struct erofs_inode_extended {
__le16 i_reserved;
__le64 i_size;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
@@ -365,17 +364,16 @@ enum {
struct z_erofs_vle_decompressed_index {
__le16 di_advise;
- /* where to decompress in the head cluster */
+ /* where to decompress in the head lcluster */
__le16 di_clusterofs;
union {
- /* for the head cluster */
+ /* for the HEAD lclusters */
__le32 blkaddr;
/*
- * for the rest clusters
- * eg. for 4k page-sized cluster, maximum 4K*64k = 256M)
- * [0] - pointing to the head cluster
- * [1] - pointing to the tail cluster
+ * for the NONHEAD lclusters
+ * [0] - distance to its HEAD lcluster
+ * [1] - distance to the next HEAD lcluster
*/
__le16 delta[2];
} di_u;
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
new file mode 100644
index 000000000000..a5cc4ed2cd0d
--- /dev/null
+++ b/fs/erofs/fscache.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022, Alibaba Cloud
+ */
+#include <linux/fscache.h>
+#include "internal.h"
+
+static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct netfs_io_request *rreq;
+
+ rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ if (!rreq)
+ return ERR_PTR(-ENOMEM);
+
+ rreq->start = start;
+ rreq->len = len;
+ rreq->mapping = mapping;
+ INIT_LIST_HEAD(&rreq->subrequests);
+ refcount_set(&rreq->ref, 1);
+ return rreq;
+}
+
+static void erofs_fscache_put_request(struct netfs_io_request *rreq)
+{
+ if (!refcount_dec_and_test(&rreq->ref))
+ return;
+ if (rreq->cache_resources.ops)
+ rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
+ kfree(rreq);
+}
+
+static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
+{
+ if (!refcount_dec_and_test(&subreq->ref))
+ return;
+ erofs_fscache_put_request(subreq->rreq);
+ kfree(subreq);
+}
+
+static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ while (!list_empty(&rreq->subrequests)) {
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ erofs_fscache_put_subrequest(subreq);
+ }
+}
+
+static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct folio *folio;
+ unsigned int iopos = 0;
+ pgoff_t start_page = rreq->start / PAGE_SIZE;
+ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ bool subreq_failed = false;
+
+ XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ subreq_failed = (subreq->error < 0);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_page) {
+ unsigned int pgpos =
+ (folio_index(folio) - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + folio_size(folio);
+ bool pg_failed = false;
+
+ for (;;) {
+ if (!subreq) {
+ pg_failed = true;
+ break;
+ }
+
+ pg_failed |= subreq_failed;
+ if (pgend < iopos + subreq->len)
+ break;
+
+ iopos += subreq->len;
+ if (!list_is_last(&subreq->rreq_link,
+ &rreq->subrequests)) {
+ subreq = list_next_entry(subreq, rreq_link);
+ subreq_failed = (subreq->error < 0);
+ } else {
+ subreq = NULL;
+ subreq_failed = false;
+ }
+ if (pgend == iopos)
+ break;
+ }
+
+ if (!pg_failed)
+ folio_mark_uptodate(folio);
+
+ folio_unlock(folio);
+ }
+ rcu_read_unlock();
+}
+
+static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+{
+ erofs_fscache_rreq_unlock_folios(rreq);
+ erofs_fscache_clear_subrequests(rreq);
+ erofs_fscache_put_request(rreq);
+}
+
+static void erofs_fscache_subreq_complete(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ subreq->error = transferred_or_error;
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ erofs_fscache_put_subrequest(subreq);
+}
+
+/*
+ * Read data from fscache into the page cache described by @rreq; both the
+ * start and length must be PAGE_SIZE aligned. @pstart is the start physical
+ * address in the cache file.
+ */
+static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
+ struct netfs_io_request *rreq, loff_t pstart)
+{
+ enum netfs_io_source source;
+ struct super_block *sb = rreq->mapping->host->i_sb;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct iov_iter iter;
+ loff_t start = rreq->start;
+ size_t len = rreq->len;
+ size_t done = 0;
+ int ret;
+
+ atomic_set(&rreq->nr_outstanding, 1);
+
+ ret = fscache_begin_read_operation(cres, cookie);
+ if (ret)
+ goto out;
+
+ while (done < len) {
+ subreq = kzalloc(sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
+ if (subreq) {
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ refcount_inc(&rreq->ref);
+ } else {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ subreq->start = pstart + done;
+ subreq->len = len - done;
+ subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
+
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+ source = cres->ops->prepare_read(subreq, LLONG_MAX);
+ if (WARN_ON(subreq->len == 0))
+ source = NETFS_INVALID_READ;
+ if (source != NETFS_READ_FROM_CACHE) {
+ erofs_err(sb, "failed to fscache prepare_read (source %d)",
+ source);
+ ret = -EIO;
+ subreq->error = ret;
+ erofs_fscache_put_subrequest(subreq);
+ goto out;
+ }
+
+ atomic_inc(&rreq->nr_outstanding);
+
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ start + done, subreq->len);
+
+ ret = fscache_read(cres, subreq->start, &iter,
+ NETFS_READ_HOLE_FAIL,
+				   erofs_fscache_subreq_complete, subreq);
+ if (ret == -EIOCBQUEUED)
+ ret = 0;
+ if (ret) {
+ erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ goto out;
+ }
+
+ done += subreq->len;
+ }
+out:
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ return ret;
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
+ int ret;
+ struct super_block *sb = folio_mapping(folio)->host->i_sb;
+ struct netfs_io_request *rreq;
+ struct erofs_map_dev mdev = {
+ .m_deviceid = 0,
+ .m_pa = folio_pos(folio),
+ };
+
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ goto out;
+
+ rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+	if (IS_ERR(rreq)) {
+		ret = PTR_ERR(rreq);
+		goto out;
+	}
+
+ return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa);
+out:
+ folio_unlock(folio);
+ return ret;
+}
+
+static int erofs_fscache_read_folio_inline(struct folio *folio,
+ struct erofs_map_blocks *map)
+{
+ struct super_block *sb = folio_mapping(folio)->host->i_sb;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_blk_t blknr;
+ size_t offset, len;
+ void *src, *dst;
+
+ /* For tail packing layout, the offset may be non-zero. */
+ offset = erofs_blkoff(map->m_pa);
+ blknr = erofs_blknr(map->m_pa);
+ len = map->m_llen;
+
+ src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ dst = kmap_local_folio(folio, 0);
+ memcpy(dst, src + offset, len);
+ memset(dst + len, 0, PAGE_SIZE - len);
+ kunmap_local(dst);
+
+ erofs_put_metabuf(&buf);
+ return 0;
+}
+
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+{
+ struct inode *inode = folio_mapping(folio)->host;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+ struct netfs_io_request *rreq;
+ erofs_off_t pos;
+ loff_t pstart;
+ int ret;
+
+ DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+
+ pos = folio_pos(folio);
+ map.m_la = pos;
+
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret)
+ goto out_unlock;
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_range(folio, 0, folio_size(folio));
+ goto out_uptodate;
+ }
+
+ if (map.m_flags & EROFS_MAP_META) {
+ ret = erofs_fscache_read_folio_inline(folio, &map);
+ goto out_uptodate;
+ }
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ goto out_unlock;
+
+ rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+	if (IS_ERR(rreq)) {
+		ret = PTR_ERR(rreq);
+		goto out_unlock;
+	}
+
+ pstart = mdev.m_pa + (pos - map.m_la);
+ return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, pstart);
+
+out_uptodate:
+ if (!ret)
+ folio_mark_uptodate(folio);
+out_unlock:
+ folio_unlock(folio);
+ return ret;
+}
+
+static void erofs_fscache_advance_folios(struct readahead_control *rac,
+ size_t len, bool unlock)
+{
+ while (len) {
+ struct folio *folio = readahead_folio(rac);
+ len -= folio_size(folio);
+ if (unlock) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ }
+}
+
+static void erofs_fscache_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ size_t len, count, done = 0;
+ erofs_off_t pos;
+ loff_t start, offset;
+ int ret;
+
+ if (!readahead_count(rac))
+ return;
+
+ start = readahead_pos(rac);
+ len = readahead_length(rac);
+
+ do {
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+ struct netfs_io_request *rreq;
+
+ pos = start + done;
+ map.m_la = pos;
+
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret)
+ return;
+
+ offset = start + done;
+ count = min_t(size_t, map.m_llen - (pos - map.m_la),
+ len - done);
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ struct iov_iter iter;
+
+ iov_iter_xarray(&iter, READ, &rac->mapping->i_pages,
+ offset, count);
+ iov_iter_zero(count, &iter);
+
+ erofs_fscache_advance_folios(rac, count, true);
+ ret = count;
+ continue;
+ }
+
+ if (map.m_flags & EROFS_MAP_META) {
+ struct folio *folio = readahead_folio(rac);
+
+ ret = erofs_fscache_read_folio_inline(folio, &map);
+ if (!ret) {
+ folio_mark_uptodate(folio);
+ ret = folio_size(folio);
+ }
+
+ folio_unlock(folio);
+ continue;
+ }
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return;
+
+ rreq = erofs_fscache_alloc_request(rac->mapping, offset, count);
+ if (IS_ERR(rreq))
+ return;
+	/*
+	 * Drop the folio refs here. They are unlocked in
+	 * erofs_fscache_rreq_unlock_folios() when the rreq completes.
+	 */
+ erofs_fscache_advance_folios(rac, count, false);
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa + (pos - map.m_la));
+ if (!ret)
+ ret = count;
+ } while (ret > 0 && ((done += ret) < len));
+}
+
+static const struct address_space_operations erofs_fscache_meta_aops = {
+ .read_folio = erofs_fscache_meta_read_folio,
+};
+
+const struct address_space_operations erofs_fscache_access_aops = {
+ .read_folio = erofs_fscache_read_folio,
+ .readahead = erofs_fscache_readahead,
+};
+
+int erofs_fscache_register_cookie(struct super_block *sb,
+ struct erofs_fscache **fscache,
+ char *name, bool need_inode)
+{
+ struct fscache_volume *volume = EROFS_SB(sb)->volume;
+ struct erofs_fscache *ctx;
+ struct fscache_cookie *cookie;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
+ name, strlen(name), NULL, 0, 0);
+ if (!cookie) {
+ erofs_err(sb, "failed to get cookie for %s", name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ fscache_use_cookie(cookie, false);
+ ctx->cookie = cookie;
+
+ if (need_inode) {
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode) {
+ erofs_err(sb, "failed to get anon inode for %s", name);
+ ret = -ENOMEM;
+ goto err_cookie;
+ }
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+
+ ctx->inode = inode;
+ }
+
+ *fscache = ctx;
+ return 0;
+
+err_cookie:
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+ ctx->cookie = NULL;
+err:
+ kfree(ctx);
+ return ret;
+}
+
+void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+{
+ struct erofs_fscache *ctx = *fscache;
+
+ if (!ctx)
+ return;
+
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+ ctx->cookie = NULL;
+
+ iput(ctx->inode);
+ ctx->inode = NULL;
+
+ kfree(ctx);
+ *fscache = NULL;
+}
+
+int erofs_fscache_register_fs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct fscache_volume *volume;
+ char *name;
+ int ret = 0;
+
+ name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid);
+ if (!name)
+ return -ENOMEM;
+
+ volume = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(volume)) {
+ erofs_err(sb, "failed to register volume for %s", name);
+ ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
+ volume = NULL;
+ }
+
+ sbi->volume = volume;
+ kfree(name);
+ return ret;
+}
+
+void erofs_fscache_unregister_fs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ fscache_relinquish_volume(sbi->volume, NULL, false);
+ sbi->volume = NULL;
+}
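erofs_fscache_read_folios_async() above follows the netfs counting convention: nr_outstanding starts at 1, each submitted subrequest adds one, and the submitter drops its own bias last, so completion runs exactly once whether the final subrequest finishes before or after submission ends. A userspace sketch of that pattern, assuming C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_outstanding;

static void rreq_complete(void)
{
	printf("request complete\n");	/* runs exactly once */
}

static void subreq_done(void)
{
	/* atomic_fetch_sub() returns the prior value; 1 means we hit 0. */
	if (atomic_fetch_sub(&nr_outstanding, 1) == 1)
		rreq_complete();
}

int main(void)
{
	int i;

	/* Bias of 1 held by the submitter itself. */
	atomic_store(&nr_outstanding, 1);

	for (i = 0; i < 3; i++) {
		atomic_fetch_add(&nr_outstanding, 1);	/* one per subrequest */
		subreq_done();				/* may complete inline */
	}

	/* Drop the submitter's bias; completes here if all subreqs are done. */
	subreq_done();
	return 0;
}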
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index e8b37ba5e9ad..bcc8335b46b3 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -8,11 +8,6 @@
#include <trace/events/erofs.h>
-/*
- * if inode is successfully read, return its inode page (or sometimes
- * the inode payload page if it's an extended inode) in order to fill
- * inline data if possible.
- */
static void *erofs_read_inode(struct erofs_buf *buf,
struct inode *inode, unsigned int *ofs)
{
@@ -297,6 +292,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
out_unlock:
erofs_put_metabuf(&buf);
@@ -370,7 +369,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
return 0;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 5298c4ee277d..cfee49d33b95 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -49,6 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
+ struct erofs_fscache *fscache;
struct block_device *bdev;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -74,6 +75,7 @@ struct erofs_mount_opts {
unsigned int max_sync_decompress_pages;
#endif
unsigned int mount_opt;
+ char *fsid;
};
struct erofs_dev_context {
@@ -96,6 +98,11 @@ struct erofs_sb_lz4_info {
u16 max_pclusterblks;
};
+struct erofs_fscache {
+ struct fscache_cookie *cookie;
+ struct inode *inode;
+};
+
struct erofs_sb_info {
struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
@@ -146,6 +153,10 @@ struct erofs_sb_info {
/* sysfs support */
struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
struct completion s_kobj_unregister;
+
+ /* fscache support */
+ struct fscache_volume *volume;
+ struct erofs_fscache *s_fscache;
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -161,6 +172,11 @@ struct erofs_sb_info {
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+static inline bool erofs_is_fscache_mode(struct super_block *sb)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+}
+
enum {
EROFS_ZIP_CACHE_DISABLED,
EROFS_ZIP_CACHE_READAHEAD,
@@ -381,31 +397,6 @@ extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
extern const struct address_space_operations z_erofs_aops;
-/*
- * Logical to physical block mapping
- *
- * Different with other file systems, it is used for 2 access modes:
- *
- * 1) RAW access mode:
- *
- * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
- * and get the valid m_pblk, m_pofs and the longest m_len(in bytes).
- *
- * Note that m_lblk in the RAW access mode refers to the number of
- * the compressed ondisk block rather than the uncompressed
- * in-memory block for the compressed file.
- *
- * m_pofs equals to m_lofs except for the inline data page.
- *
- * 2) Normal access mode:
- *
- * If the inode is not compressed, it has no difference with
- * the RAW access mode. However, if the inode is compressed,
- * users should pass a valid (m_lblk, m_lofs) pair, and get
- * the needed m_pblk, m_pofs, m_len to get the compressed data
- * and the updated m_lblk, m_lofs which indicates the start
- * of the corresponding uncompressed data in the file.
- */
enum {
BH_Encoded = BH_PrivateStart,
BH_FullMapped,
@@ -467,6 +458,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
#endif /* !CONFIG_EROFS_FS_ZIP */
struct erofs_map_dev {
+ struct erofs_fscache *m_fscache;
struct block_device *m_bdev;
struct dax_device *m_daxdev;
u64 m_dax_part_off;
@@ -486,6 +478,8 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags);
/* inode.c */
static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
@@ -509,7 +503,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
/* namei.c */
extern const struct inode_operations erofs_dir_iops;
-int erofs_namei(struct inode *dir, struct qstr *name,
+int erofs_namei(struct inode *dir, const struct qstr *name,
erofs_nid_t *nid, unsigned int *d_type);
/* dir.c */
@@ -611,6 +605,36 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+/* fscache.c */
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+int erofs_fscache_register_fs(struct super_block *sb);
+void erofs_fscache_unregister_fs(struct super_block *sb);
+
+int erofs_fscache_register_cookie(struct super_block *sb,
+ struct erofs_fscache **fscache,
+ char *name, bool need_inode);
+void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache);
+
+extern const struct address_space_operations erofs_fscache_access_aops;
+#else
+static inline int erofs_fscache_register_fs(struct super_block *sb)
+{
+ return 0;
+}
+static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
+
+static inline int erofs_fscache_register_cookie(struct super_block *sb,
+ struct erofs_fscache **fscache,
+ char *name, bool need_inode)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+{
+}
+#endif
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
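A minimal sketch of how these hooks are meant to pair up, modeled on the super.c hunks later in this patch; the helper name and the trimmed error handling are illustrative only:

	static int example_fscache_setup(struct super_block *sb,
					 struct erofs_sb_info *sbi)
	{
		int err = erofs_fscache_register_fs(sb);

		if (err)
			return err;
		err = erofs_fscache_register_cookie(sb, &sbi->s_fscache,
						    sbi->opt.fsid, true);
		if (err)
			erofs_fscache_unregister_fs(sb);
		return err;
	}

On !CONFIG_EROFS_FS_ONDEMAND builds the cookie stub returns -EOPNOTSUPP, so a caller structured like this fails fast instead of touching an unregistered volume.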
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 554efa363317..fd75506799c4 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -165,9 +165,8 @@ out: /* free if the candidate is valid */
return candidate;
}
-int erofs_namei(struct inode *dir,
- struct qstr *name,
- erofs_nid_t *nid, unsigned int *d_type)
+int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
+ unsigned int *d_type)
{
int ndirents;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0c4b41130c2f..95addc5c9d34 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -13,6 +13,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/dax.h>
+#include <linux/exportfs.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -219,7 +220,52 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
#endif
-static int erofs_init_devices(struct super_block *sb,
+static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
+ struct erofs_device_info *dif, erofs_off_t *pos)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_deviceslot *dis;
+ struct block_device *bdev;
+ void *ptr;
+ int ret;
+
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ dis = ptr + erofs_blkoff(*pos);
+
+ if (!dif->path) {
+ if (!dis->tag[0]) {
+ erofs_err(sb, "empty device tag @ pos %llu", *pos);
+ return -EINVAL;
+ }
+ dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL);
+ if (!dif->path)
+ return -ENOMEM;
+ }
+
+ if (erofs_is_fscache_mode(sb)) {
+ ret = erofs_fscache_register_cookie(sb, &dif->fscache,
+ dif->path, false);
+ if (ret)
+ return ret;
+ } else {
+ bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+ }
+
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ *pos += EROFS_DEVT_SLOT_SIZE;
+ return 0;
+}
+
+static int erofs_scan_devices(struct super_block *sb,
struct erofs_super_block *dsb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -227,8 +273,6 @@ static int erofs_init_devices(struct super_block *sb,
erofs_off_t pos;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_device_info *dif;
- struct erofs_deviceslot *dis;
- void *ptr;
int id, err = 0;
sbi->total_blocks = sbi->primarydevice_blocks;
@@ -237,7 +281,8 @@ static int erofs_init_devices(struct super_block *sb,
else
ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
- if (ondisk_extradevs != sbi->devs->extra_devices) {
+ if (sbi->devs->extra_devices &&
+ ondisk_extradevs != sbi->devs->extra_devices) {
erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
@@ -248,30 +293,31 @@ static int erofs_init_devices(struct super_block *sb,
sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
down_read(&sbi->devs->rwsem);
- idr_for_each_entry(&sbi->devs->tree, dif, id) {
- struct block_device *bdev;
-
- ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
- EROFS_KMAP);
- if (IS_ERR(ptr)) {
- err = PTR_ERR(ptr);
- break;
+ if (sbi->devs->extra_devices) {
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
}
- dis = ptr + erofs_blkoff(pos);
-
- bdev = blkdev_get_by_path(dif->path,
- FMODE_READ | FMODE_EXCL,
- sb->s_type);
- if (IS_ERR(bdev)) {
- err = PTR_ERR(bdev);
- break;
+ } else {
+ for (id = 0; id < ondisk_extradevs; id++) {
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif) {
+ err = -ENOMEM;
+ break;
+ }
+
+ err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ if (err < 0) {
+ kfree(dif);
+ break;
+ }
+ ++sbi->devs->extra_devices;
+
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
}
- dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
- dif->blocks = le32_to_cpu(dis->blocks);
- dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
- sbi->total_blocks += dif->blocks;
- pos += EROFS_DEVT_SLOT_SIZE;
}
up_read(&sbi->devs->rwsem);
erofs_put_metabuf(&buf);
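The new else-branch is what lets a fscache-mode mount work without any -o device= options: when no extra devices were supplied at mount time, erofs_scan_devices() now allocates an erofs_device_info per on-disk slot and lets erofs_init_device() pick the path up from the slot's tag, bumping extra_devices as it goes. Mounts that do pass device options keep the old strict count check.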
@@ -358,10 +404,12 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
/* handle multiple devices */
- ret = erofs_init_devices(sb, dsb);
+ ret = erofs_scan_devices(sb, dsb);
if (erofs_sb_has_ztailpacking(sbi))
erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
+ if (erofs_is_fscache_mode(sb))
+ erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
@@ -390,6 +438,7 @@ enum {
Opt_dax,
Opt_dax_enum,
Opt_device,
+ Opt_fsid,
Opt_err
};
@@ -414,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
fsparam_string("device", Opt_device),
+ fsparam_string("fsid", Opt_fsid),
{}
};
@@ -509,6 +559,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
}
++ctx->devs->extra_devices;
break;
+ case Opt_fsid:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ kfree(ctx->opt.fsid);
+ ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->opt.fsid)
+ return -ENOMEM;
+#else
+ errorfc(fc, "fsid option not supported");
+#endif
+ break;
default:
return -ENOPARAM;
}
@@ -518,16 +578,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
#ifdef CONFIG_EROFS_FS_ZIP
static const struct address_space_operations managed_cache_aops;
-static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp)
{
- int ret = 1; /* 0 - busy */
- struct address_space *const mapping = page->mapping;
+ bool ret = true;
+ struct address_space *const mapping = folio->mapping;
- DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(!folio_test_locked(folio));
DBG_BUGON(mapping->a_ops != &managed_cache_aops);
- if (PagePrivate(page))
- ret = erofs_try_to_free_cached_page(page);
+ if (folio_test_private(folio))
+ ret = erofs_try_to_free_cached_page(&folio->page);
return ret;
}
@@ -548,12 +608,12 @@ static void erofs_managed_cache_invalidate_folio(struct folio *folio,
DBG_BUGON(stop > folio_size(folio) || stop < length);
if (offset == 0 && stop == folio_size(folio))
- while (!erofs_managed_cache_releasepage(&folio->page, GFP_NOFS))
+ while (!erofs_managed_cache_release_folio(folio, GFP_NOFS))
cond_resched();
}
static const struct address_space_operations managed_cache_aops = {
- .releasepage = erofs_managed_cache_releasepage,
+ .release_folio = erofs_managed_cache_release_folio,
.invalidate_folio = erofs_managed_cache_invalidate_folio,
};
@@ -577,6 +637,44 @@ static int erofs_init_managed_cache(struct super_block *sb)
static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif
+static struct inode *erofs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ return erofs_iget(sb, ino, false);
+}
+
+static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_get_parent(struct dentry *child)
+{
+ erofs_nid_t nid;
+ unsigned int d_type;
+ int err;
+
+ err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type);
+ if (err)
+ return ERR_PTR(err);
+ return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR));
+}
+
+static const struct export_operations erofs_export_ops = {
+ .fh_to_dentry = erofs_fh_to_dentry,
+ .fh_to_parent = erofs_fh_to_parent,
+ .get_parent = erofs_get_parent,
+};
+
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
@@ -585,11 +683,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
int err;
sb->s_magic = EROFS_SUPER_MAGIC;
-
- if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
- erofs_err(sb, "failed to set erofs blksize");
- return -EINVAL;
- }
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &erofs_sops;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -597,10 +693,36 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_fs_info = sbi;
sbi->opt = ctx->opt;
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off);
+ ctx->opt.fsid = NULL;
sbi->devs = ctx->devs;
ctx->devs = NULL;
+ if (erofs_is_fscache_mode(sb)) {
+ sb->s_blocksize = EROFS_BLKSIZ;
+ sb->s_blocksize_bits = LOG_BLOCK_SIZE;
+
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+
+ err = erofs_fscache_register_cookie(sb, &sbi->s_fscache,
+ sbi->opt.fsid, true);
+ if (err)
+ return err;
+
+ err = super_setup_bdi(sb);
+ if (err)
+ return err;
+ } else {
+ if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
+ erofs_err(sb, "failed to set erofs blksize");
+ return -EINVAL;
+ }
+
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dax_part_off);
+ }
+
err = erofs_read_superblock(sb);
if (err)
return err;
@@ -613,12 +735,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
clear_opt(&sbi->opt, DAX_ALWAYS);
}
}
- sb->s_flags |= SB_RDONLY | SB_NOATIME;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_op = &erofs_sops;
+ sb->s_time_gran = 1;
sb->s_xattr = erofs_xattr_handlers;
+ sb->s_export_op = &erofs_export_ops;
if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
@@ -661,6 +781,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
static int erofs_fc_get_tree(struct fs_context *fc)
{
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+
return get_tree_bdev(fc, erofs_fc_fill_super);
}
@@ -690,6 +815,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
fs_put_dax(dif->dax_dev);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ erofs_fscache_unregister_cookie(&dif->fscache);
kfree(dif->path);
kfree(dif);
return 0;
@@ -709,6 +835,7 @@ static void erofs_fc_free(struct fs_context *fc)
struct erofs_fs_context *ctx = fc->fs_private;
erofs_free_dev_context(ctx->devs);
+ kfree(ctx->opt.fsid);
kfree(ctx);
}
@@ -749,7 +876,10 @@ static void erofs_kill_sb(struct super_block *sb)
WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
- kill_block_super(sb);
+ if (erofs_is_fscache_mode(sb))
+ generic_shutdown_super(sb);
+ else
+ kill_block_super(sb);
sbi = EROFS_SB(sb);
if (!sbi)
@@ -757,6 +887,9 @@ static void erofs_kill_sb(struct super_block *sb)
erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
+ erofs_fscache_unregister_cookie(&sbi->s_fscache);
+ erofs_fscache_unregister_fs(sb);
+ kfree(sbi->opt.fsid);
kfree(sbi);
sb->s_fs_info = NULL;
}
@@ -774,6 +907,7 @@ static void erofs_put_super(struct super_block *sb)
iput(sbi->managed_cache);
sbi->managed_cache = NULL;
#endif
+ erofs_fscache_unregister_cookie(&sbi->s_fscache);
}
static struct file_system_type erofs_fs_type = {
@@ -781,7 +915,7 @@ static struct file_system_type erofs_fs_type = {
.name = "erofs",
.init_fs_context = erofs_init_fs_context,
.kill_sb = erofs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("erofs");
@@ -857,7 +991,10 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ u64 id = 0;
+
+ if (!erofs_is_fscache_mode(sb))
+ id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
@@ -902,6 +1039,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (opt->fsid)
+ seq_printf(seq, ",fsid=%s", opt->fsid);
+#endif
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index f3babf1e6608..c1383e508bbe 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -205,8 +205,8 @@ int erofs_register_sysfs(struct super_block *sb)
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL,
- "%s", sb->s_id);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
+ erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id);
if (err)
goto put_sb_kobj;
return 0;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index e6dea6dfca16..95efc127b2ba 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -791,7 +791,7 @@ err_out:
static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
unsigned int readahead_pages)
{
- /* auto: enable for readpage, disable for readahead */
+ /* auto: enable for read_folio, disable for readahead */
if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
!readahead_pages)
return true;
@@ -1488,8 +1488,9 @@ skip:
}
}
-static int z_erofs_readpage(struct file *file, struct page *page)
+static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *const inode = page->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
@@ -1563,6 +1564,6 @@ static void z_erofs_readahead(struct readahead_control *rac)
}
const struct address_space_operations z_erofs_aops = {
- .readpage = z_erofs_readpage,
+ .read_folio = z_erofs_read_folio,
.readahead = z_erofs_readahead,
};
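The ->readpage to ->read_folio conversions in this series are mechanical. Where the underlying logic stays page-based, the bridge is just the &folio->page shim seen above; in isolation (a sketch assuming order-0 folios, i.e. one page per folio, with an illustrative legacy helper):

	static int example_read_folio(struct file *file, struct folio *folio)
	{
		/* order-0 folio: the folio is exactly one page */
		return example_legacy_readpage(file, &folio->page);
	}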
diff --git a/fs/exec.c b/fs/exec.c
index e3e55d5e0be1..14b4b3755580 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long stack_size;
unsigned long stack_expand;
unsigned long rlim_stack;
+ struct mmu_gather tlb;
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size */
@@ -812,8 +813,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
vm_flags |= mm->def_flags;
vm_flags |= VM_STACK_INCOMPLETE_SETUP;
- ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
+ tlb_gather_mmu(&tlb, mm);
+ ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end,
vm_flags);
+ tlb_finish_mmu(&tlb);
+
if (ret)
goto out_unlock;
BUG_ON(prev != vma);
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 03f142307174..9f42f25fab92 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -148,7 +148,9 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ if (!is_valid_cluster(sbi, clu))
+ return -EINVAL;
+
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
@@ -166,7 +168,9 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_mount_options *opts = &sbi->options;
- WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ if (!is_valid_cluster(sbi, clu))
+ return;
+
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index c6800b880920..4a7a2308eb72 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -203,6 +203,7 @@ struct exfat_mount_options {
/* on error: continue, panic, remount-ro */
enum exfat_error_mode errors;
unsigned utf8:1, /* Use of UTF-8 character set */
+ sys_tz:1, /* Use local timezone */
discard:1, /* Issue discard requests on deletions */
keep_last_dots:1; /* Keep trailing periods in paths */
int time_offset; /* Offset of timestamps from UTC (in minutes) */
@@ -381,6 +382,12 @@ static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
EXFAT_RESERVED_CLUSTERS;
}
+static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
+ unsigned int clus)
+{
+ return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
+}
+
/* super.c */
int exfat_set_volume_dirty(struct super_block *sb);
int exfat_clear_volume_dirty(struct super_block *sb);
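Worked example of the relocated bounds check, assuming EXFAT_FIRST_CLUSTER == 2 (exFAT cluster numbering starts at 2) and sbi->num_clusters == 10: is_valid_cluster() accepts clusters 2 through 9 and rejects 0, 1 and 10, inputs that previously only tripped a WARN_ON in the bitmap helpers and were then used anyway.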
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index a3464e56a7e1..9de6a6b844c9 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <asm/unaligned.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -81,12 +82,6 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc,
return 0;
}
-static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
- unsigned int clus)
-{
- return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
-}
-
int exfat_ent_get(struct super_block *sb, unsigned int loc,
unsigned int *content)
{
@@ -274,10 +269,9 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
{
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct buffer_head *bhs[MAX_BUF_PER_PAGE];
- int nr_bhs = MAX_BUF_PER_PAGE;
+ struct buffer_head *bh;
sector_t blknr, last_blknr;
- int err, i, n;
+ int i;
blknr = exfat_cluster_to_sector(sbi, clu);
last_blknr = blknr + sbi->sect_per_clus;
@@ -291,30 +285,23 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
}
/* Zeroing the unused blocks on this cluster */
- while (blknr < last_blknr) {
- for (n = 0; n < nr_bhs && blknr < last_blknr; n++, blknr++) {
- bhs[n] = sb_getblk(sb, blknr);
- if (!bhs[n]) {
- err = -ENOMEM;
- goto release_bhs;
- }
- memset(bhs[n]->b_data, 0, sb->s_blocksize);
- }
-
- err = exfat_update_bhs(bhs, n, IS_DIRSYNC(dir));
- if (err)
- goto release_bhs;
+ for (i = blknr; i < last_blknr; i++) {
+ bh = sb_getblk(sb, i);
+ if (!bh)
+ return -ENOMEM;
- for (i = 0; i < n; i++)
- brelse(bhs[i]);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ brelse(bh);
}
- return 0;
-release_bhs:
- exfat_err(sb, "failed zeroed sect %llu\n", (unsigned long long)blknr);
- for (i = 0; i < n; i++)
- bforget(bhs[i]);
- return err;
+ if (IS_DIRSYNC(dir))
+ return sync_blockdev_range(sb->s_bdev,
+ EXFAT_BLK_TO_B(blknr, sb),
+ EXFAT_BLK_TO_B(last_blknr, sb) - 1);
+
+ return 0;
}
int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
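sync_blockdev_range() takes an inclusive byte range, which is why the hunk above subtracts one from the converted end block. The same conversion in isolation (a sketch assuming EXFAT_BLK_TO_B is a plain shift by the block-size bits):

	static int example_flush_blocks(struct super_block *sb,
					sector_t start, sector_t end)
	{
		loff_t lstart = (loff_t)start << sb->s_blocksize_bits;
		loff_t lend = ((loff_t)end << sb->s_blocksize_bits) - 1;

		return sync_blockdev_range(sb->s_bdev, lstart, lend);
	}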
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 2f5130059236..20d4e47f57ab 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -351,21 +351,20 @@ out:
static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
{
- struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(inode->i_sb->s_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range)))
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(inode->i_sb->s_bdev));
ret = exfat_trim_fs(inode, &range);
if (ret < 0)
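For reference, this path is reached from userspace through the FITRIM ioctl; a minimal caller (error handling elided) looks like:

	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/types.h>

	int trim_whole_fs(int fd)
	{
		struct fstrim_range range = {
			.start = 0,
			.len = (__u64)-1,
			.minlen = 0,	/* kernel clamps this up to the granularity */
		};

		return ioctl(fd, FITRIM, &range);
	}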
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index fc0ea1684880..0133d385d8e8 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -357,9 +357,9 @@ unlock_ret:
return err;
}
-static int exfat_readpage(struct file *file, struct page *page)
+static int exfat_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, exfat_get_block);
+ return mpage_read_folio(folio, exfat_get_block);
}
static void exfat_readahead(struct readahead_control *rac)
@@ -389,13 +389,13 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
}
static int exfat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int flags,
+ loff_t pos, unsigned int len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
exfat_get_block,
&EXFAT_I(mapping->host)->i_size_ondisk);
@@ -492,7 +492,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
static const struct address_space_operations exfat_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = exfat_readpage,
+ .read_folio = exfat_read_folio,
.readahead = exfat_readahead,
.writepage = exfat_writepage,
.writepages = exfat_writepages,
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index d5bd8e6d9741..9380e0188b55 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -74,6 +74,13 @@ static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off)
ts->tv_sec += TIMEZONE_SEC(0x80 - tz_off);
}
+static inline int exfat_tz_offset(struct exfat_sb_info *sbi)
+{
+ if (sbi->options.sys_tz)
+ return -sys_tz.tz_minuteswest;
+ return sbi->options.time_offset;
+}
+
/* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_cs)
@@ -96,8 +103,7 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
/* Adjust timezone to UTC0. */
exfat_adjust_tz(ts, tz & ~EXFAT_TZ_VALID);
else
- /* Convert from local time to UTC using time_offset. */
- ts->tv_sec -= sbi->options.time_offset * SECS_PER_MIN;
+ ts->tv_sec -= exfat_tz_offset(sbi) * SECS_PER_MIN;
}
/* Convert linear UNIX date to a EXFAT time/date pair. */
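Worked example of the sys_tz path: with sys_tz.tz_minuteswest == 300 (UTC-5), exfat_tz_offset() returns -300, so the caller's "ts->tv_sec -= exfat_tz_offset(sbi) * SECS_PER_MIN" adds 18000 seconds, correctly shifting a local-time stamp five hours forward to UTC.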
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index a02a04a993bf..76acc3721951 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -1080,6 +1080,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
exfat_remove_entries(inode, p_dir, oldentry, 0,
num_old_entries);
+ ei->dir = *p_dir;
ei->entry = newentry;
} else {
if (exfat_get_entry_type(epold) == TYPE_FILE) {
@@ -1167,28 +1168,6 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
return 0;
}
-static void exfat_update_parent_info(struct exfat_inode_info *ei,
- struct inode *parent_inode)
-{
- struct exfat_sb_info *sbi = EXFAT_SB(parent_inode->i_sb);
- struct exfat_inode_info *parent_ei = EXFAT_I(parent_inode);
- loff_t parent_isize = i_size_read(parent_inode);
-
- /*
- * the problem that struct exfat_inode_info caches wrong parent info.
- *
- * because of flag-mismatch of ei->dir,
- * there is abnormal traversing cluster chain.
- */
- if (unlikely(parent_ei->flags != ei->dir.flags ||
- parent_isize != EXFAT_CLU_TO_B(ei->dir.size, sbi) ||
- parent_ei->start_clu != ei->dir.dir)) {
- exfat_chain_set(&ei->dir, parent_ei->start_clu,
- EXFAT_B_TO_CLU_ROUND_UP(parent_isize, sbi),
- parent_ei->flags);
- }
-}
-
/* rename or move a old file into a new file */
static int __exfat_rename(struct inode *old_parent_inode,
struct exfat_inode_info *ei, struct inode *new_parent_inode,
@@ -1219,8 +1198,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
return -ENOENT;
}
- exfat_update_parent_info(ei, old_parent_inode);
-
exfat_chain_dup(&olddir, &ei->dir);
dentry = ei->entry;
@@ -1241,8 +1218,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
goto out;
}
- exfat_update_parent_info(new_ei, new_parent_inode);
-
p_dir = &(new_ei->dir);
new_entry = new_ei->entry;
ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8ca21e7917d1..6a4dfe9f31ee 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -170,7 +170,9 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",discard");
if (opts->keep_last_dots)
seq_puts(m, ",keep_last_dots");
- if (opts->time_offset)
+ if (opts->sys_tz)
+ seq_puts(m, ",sys_tz");
+ else if (opts->time_offset)
seq_printf(m, ",time_offset=%d", opts->time_offset);
return 0;
}
@@ -214,6 +216,7 @@ enum {
Opt_errors,
Opt_discard,
Opt_keep_last_dots,
+ Opt_sys_tz,
Opt_time_offset,
/* Deprecated options */
@@ -241,6 +244,7 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_enum("errors", Opt_errors, exfat_param_enums),
fsparam_flag("discard", Opt_discard),
fsparam_flag("keep_last_dots", Opt_keep_last_dots),
+ fsparam_flag("sys_tz", Opt_sys_tz),
fsparam_s32("time_offset", Opt_time_offset),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
@@ -298,6 +302,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_keep_last_dots:
opts->keep_last_dots = 1;
break;
+ case Opt_sys_tz:
+ opts->sys_tz = 1;
+ break;
case Opt_time_offset:
/*
* Make the limit 24 just in case someone invents something
@@ -627,13 +634,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
if (opts->allow_utime == (unsigned short)-1)
opts->allow_utime = ~opts->fs_dmask & 0022;
- if (opts->discard) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
- opts->discard = 0;
- }
+ if (opts->discard && !bdev_max_discard_sectors(sb->s_bdev)) {
+ exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
+ opts->discard = 0;
}
sb->s_flags |= SB_NODIRATIME;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 52377a0ee735..360ce3604a2d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -36,7 +36,6 @@
#include <linux/iomap.h>
#include <linux/namei.h>
#include <linux/uio.h>
-#include <linux/dax.h>
#include "ext2.h"
#include "acl.h"
#include "xattr.h"
@@ -875,9 +874,9 @@ static int ext2_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, ext2_get_block, wbc);
}
-static int ext2_readpage(struct file *file, struct page *page)
+static int ext2_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, ext2_get_block);
+ return mpage_read_folio(folio, ext2_get_block);
}
static void ext2_readahead(struct readahead_control *rac)
@@ -887,13 +886,11 @@ static void ext2_readahead(struct readahead_control *rac)
static int
ext2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- ext2_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, ext2_get_block);
if (ret < 0)
ext2_write_failed(mapping, pos + len);
return ret;
@@ -913,12 +910,11 @@ static int ext2_write_end(struct file *file, struct address_space *mapping,
static int
ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
int ret;
- ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
+ ret = nobh_write_begin(mapping, pos, len, pagep, fsdata,
ext2_get_block);
if (ret < 0)
ext2_write_failed(mapping, pos + len);
@@ -969,7 +965,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
const struct address_space_operations ext2_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = ext2_readpage,
+ .read_folio = ext2_read_folio,
.readahead = ext2_readahead,
.writepage = ext2_writepage,
.write_begin = ext2_write_begin,
@@ -985,7 +981,7 @@ const struct address_space_operations ext2_aops = {
const struct address_space_operations ext2_nobh_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = ext2_readpage,
+ .read_folio = ext2_read_folio,
.readahead = ext2_readahead,
.writepage = ext2_nobh_writepage,
.write_begin = ext2_nobh_write_begin,
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 7d89142e1421..72206a292676 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -17,3 +17,4 @@ ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
ext4-inode-test-objs += inode-test.o
obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
+ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..e20ac0654b3f
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/quotaops.h>
+#include <linux/uuid.h>
+
+#include "ext4.h"
+#include "xattr.h"
+#include "ext4_jbd2.h"
+
+static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
+ const struct fscrypt_name *src)
+{
+ memset(dst, 0, sizeof(*dst));
+
+ dst->usr_fname = src->usr_fname;
+ dst->disk_name = src->disk_name;
+ dst->hinfo.hash = src->hash;
+ dst->hinfo.minor_hash = src->minor_hash;
+ dst->crypto_buf = src->crypto_buf;
+}
+
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_setup_filename(dir, iname, lookup, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ err = ext4_fname_setup_ci_filename(dir, iname, fname);
+#endif
+ return err;
+}
+
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_prepare_lookup(dir, dentry, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
+#endif
+ return err;
+}
+
+void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+
+ name.crypto_buf = fname->crypto_buf;
+ fscrypt_free_filename(&name);
+
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
+}
+
+static bool uuid_is_zero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return false;
+ return true;
+}
+
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg)
+{
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, err2;
+ handle_t *handle;
+
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+
+ if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto pwsalt_err_exit;
+ }
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto pwsalt_err_journal;
+ lock_buffer(sbi->s_sbh);
+ generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+pwsalt_err_journal:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
+ err = err2;
+pwsalt_err_exit:
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
+}
+
+static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ handle_t *handle = fs_data;
+ int res, res2, credits, retries = 0;
+
+ /*
+ * Encrypting the root directory is not allowed because e2fsck expects
+ * lost+found to exist and be unencrypted, and encrypting the root
+ * directory would imply encrypting the lost+found directory as well as
+ * the filename "lost+found" itself.
+ */
+ if (inode->i_ino == EXT4_ROOT_INO)
+ return -EPERM;
+
+ if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
+ return -EINVAL;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
+ res = ext4_convert_inline_data(inode);
+ if (res)
+ return res;
+
+ /*
+ * If a journal handle was specified, then the encryption context is
+ * being set on a new inode via inheritance and is part of a larger
+ * transaction to create the inode. Otherwise the encryption context is
+ * being set on an existing inode in its own transaction. Only in the
+ * latter case should the "retry on ENOSPC" logic be used.
+ */
+
+ if (handle) {
+ res = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ }
+ return res;
+ }
+
+ res = dquot_initialize(inode);
+ if (res)
+ return res;
+retry:
+ res = ext4_xattr_set_credits(inode, len, false /* is_create */,
+ &credits);
+ if (res)
+ return res;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ res = ext4_mark_inode_dirty(handle, inode);
+ if (res)
+ EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+ }
+ res2 = ext4_journal_stop(handle);
+
+ if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ if (!res)
+ res = res2;
+ return res;
+}
+
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
+{
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
+}
+
+static bool ext4_has_stable_inodes(struct super_block *sb)
+{
+ return ext4_has_feature_stable_inodes(sb);
+}
+
+static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
+ int *ino_bits_ret, int *lblk_bits_ret)
+{
+ *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
+ *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
+}
+
+const struct fscrypt_operations ext4_cryptops = {
+ .key_prefix = "ext4:",
+ .get_context = ext4_get_context,
+ .set_context = ext4_set_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
+ .empty_dir = ext4_empty_dir,
+ .has_stable_inodes = ext4_has_stable_inodes,
+ .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
+};
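The pwsalt helper above backs the EXT4_IOC_GET_ENCRYPTION_PWSALT ioctl; the consumer side is a plain 16-byte copy-out. A hedged userspace sketch (the fallback define mirrors the kernel header and is guarded in case the toolchain already provides it):

	#include <sys/ioctl.h>
	#include <linux/types.h>

	#ifndef EXT4_IOC_GET_ENCRYPTION_PWSALT
	#define EXT4_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])
	#endif

	int read_pwsalt(int fd, __u8 salt[16])
	{
		/* the first call on a fresh fs also generates and journals
		 * the salt, as the kernel-side code above shows */
		return ioctl(fd, EXT4_IOC_GET_ENCRYPTION_PWSALT, salt);
	}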
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index a6bb86f52b9a..3985f8c33f95 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -412,7 +412,7 @@ struct fname {
};
/*
- * This functoin implements a non-recursive way of freeing all of the
+ * This function implements a non-recursive way of freeing all of the
* nodes in the red-black tree.
*/
static void free_rb_tree_fname(struct rb_root *root)
@@ -515,7 +515,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
/*
* This is a helper function for ext4_dx_readdir. It calls filldir
- * for all entres on the fname linked list. (Normally there is only
+ * for all entries on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
static int call_filldir(struct file *file, struct dir_context *ctx,
@@ -648,7 +648,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size;
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a743b1e3b89e..75b8d81b2469 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -673,6 +673,8 @@ enum {
/* Caller will submit data before dropping transaction handle. This
* allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Caller is in an atomic context; look up the extent only if it is cached */
+#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
/*
* The bit position of these flags must not overlap with any of the
@@ -1440,12 +1442,6 @@ struct ext4_super_block {
#ifdef __KERNEL__
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
-#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
-#endif
-
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
@@ -2731,74 +2727,20 @@ extern int ext4_fname_setup_ci_filename(struct inode *dir,
struct ext4_filename *fname);
#endif
+/* ext4 encryption-related declarations; the implementation lives in crypto.c */
#ifdef CONFIG_FS_ENCRYPTION
-static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
- const struct fscrypt_name *src)
-{
- memset(dst, 0, sizeof(*dst));
-
- dst->usr_fname = src->usr_fname;
- dst->disk_name = src->disk_name;
- dst->hinfo.hash = src->hash;
- dst->hinfo.minor_hash = src->minor_hash;
- dst->crypto_buf = src->crypto_buf;
-}
-
-static inline int ext4_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup,
- struct ext4_filename *fname)
-{
- struct fscrypt_name name;
- int err;
-
- err = fscrypt_setup_filename(dir, iname, lookup, &name);
- if (err)
- return err;
-
- ext4_fname_from_fscrypt_name(fname, &name);
-
-#if IS_ENABLED(CONFIG_UNICODE)
- err = ext4_fname_setup_ci_filename(dir, iname, fname);
-#endif
- return err;
-}
+extern const struct fscrypt_operations ext4_cryptops;
-static inline int ext4_fname_prepare_lookup(struct inode *dir,
- struct dentry *dentry,
- struct ext4_filename *fname)
-{
- struct fscrypt_name name;
- int err;
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname);
- err = fscrypt_prepare_lookup(dir, dentry, &name);
- if (err)
- return err;
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname);
- ext4_fname_from_fscrypt_name(fname, &name);
+void ext4_fname_free_filename(struct ext4_filename *fname);
-#if IS_ENABLED(CONFIG_UNICODE)
- err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
-#endif
- return err;
-}
-
-static inline void ext4_fname_free_filename(struct ext4_filename *fname)
-{
- struct fscrypt_name name;
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg);
- name.crypto_buf = fname->crypto_buf;
- fscrypt_free_filename(&name);
-
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
-
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
-}
#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
const struct qstr *iname,
@@ -2831,6 +2773,12 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname)
fname->cf_name.name = NULL;
#endif
}
+
+static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
+ void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
#endif /* !CONFIG_FS_ENCRYPTION */
/* dir.c */
@@ -3591,7 +3539,6 @@ extern int ext4_readpage_inline(struct inode *inode, struct page *page);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep);
extern int ext4_write_inline_data_end(struct inode *inode,
loff_t pos, unsigned len,
@@ -3604,7 +3551,6 @@ ext4_journalled_write_inline_data(struct inode *inode,
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep,
void **fsdata);
extern int ext4_try_add_inline_entry(handle_t *handle,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e473fde6b64b..c148bb97b527 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -372,7 +372,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
{
unsigned short entries;
ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
+ ext4_lblk_t cur = 0;
if (eh->eh_entries == 0)
return 1;
@@ -396,11 +396,11 @@ static int ext4_valid_extent_entries(struct inode *inode,
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- if ((lblock <= prev) && prev) {
+ if (lblock < cur) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
- prev = lblock + ext4_ext_get_actual_len(ext) - 1;
+ cur = lblock + ext4_ext_get_actual_len(ext);
ext++;
entries--;
}
@@ -420,13 +420,13 @@ static int ext4_valid_extent_entries(struct inode *inode,
/* Check for overlapping index extents */
lblock = le32_to_cpu(ext_idx->ei_block);
- if ((lblock <= prev) && prev) {
+ if (lblock < cur) {
*pblk = ext4_idx_pblock(ext_idx);
return 0;
}
ext_idx++;
entries--;
- prev = lblock;
+ cur = lblock + 1;
}
}
return 1;
@@ -4693,15 +4693,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
+ inode_lock(inode);
+ ret = ext4_convert_inline_data(inode);
+ inode_unlock(inode);
+ if (ret)
+ goto exit;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
ret = ext4_punch_hole(file, offset, len);
goto exit;
}
- ret = ext4_convert_inline_data(inode);
- if (ret)
- goto exit;
-
if (mode & FALLOC_FL_COLLAPSE_RANGE) {
ret = ext4_collapse_range(file, offset, len);
goto exit;
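The prev-to-cur rewrite in ext4_valid_extent_entries() closes a real gap. Worked case: two leaf extents both starting at logical block 0 with length 1. The old test "(lblock <= prev) && prev" never fired because prev was still 0 after the first extent, so the duplicate passed validation; with cur = lblock + len = 1, the second extent now fails "lblock < cur" as it should. The index path gets the matching fix with cur = lblock + 1, since an index entry covers at least one block.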
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 3d72565ec6e8..795a60ad1897 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -970,7 +970,7 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
int ret = 0;
@@ -1004,7 +1004,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *pos, *n;
int ret = 0;
@@ -1031,7 +1031,7 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
struct inode *inode;
@@ -1093,7 +1093,7 @@ lock_and_exit:
static int ext4_fc_perform_commit(journal_t *journal)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *iter;
struct ext4_fc_head head;
@@ -1198,7 +1198,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status,
*/
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int nblks = 0, ret, bsize = journal->j_blocksize;
int subtid = atomic_read(&sbi->s_fc_subtid);
@@ -1659,8 +1659,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
set_nlink(inode, 1);
ext4_mark_inode_dirty(NULL, inode);
out:
- if (inode)
- iput(inode);
+ iput(inode);
return ret;
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6feb07e3e1eb..109d07629f81 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -76,7 +76,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -565,7 +565,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
- 0);
+ NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c076262770d..cff52ff6549d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -527,13 +527,13 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
}
static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
- struct inode *inode,
- unsigned flags)
+ struct inode *inode)
{
int ret, needed_blocks, no_expand;
handle_t *handle = NULL;
int retries = 0, sem_held = 0;
struct page *page = NULL;
+ unsigned int flags;
unsigned from, to;
struct ext4_iloc iloc;
@@ -562,9 +562,9 @@ retry:
/* We cannot recurse into the filesystem as the transaction is already
* started */
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out;
@@ -649,11 +649,11 @@ out:
int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep)
{
int ret;
handle_t *handle;
+ unsigned int flags;
struct page *page;
struct ext4_iloc iloc;
@@ -691,9 +691,9 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
if (ret)
goto out;
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out;
@@ -727,8 +727,7 @@ out:
brelse(iloc.bh);
return ret;
convert:
- return ext4_convert_inline_data_to_extent(mapping,
- inode, flags);
+ return ext4_convert_inline_data_to_extent(mapping, inode);
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
@@ -848,13 +847,12 @@ ext4_journalled_write_inline_data(struct inode *inode,
*/
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
struct inode *inode,
- unsigned flags,
void **fsdata)
{
int ret = 0, inline_size;
struct page *page;
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ page = grab_cache_page_write_begin(mapping, 0);
if (!page)
return -ENOMEM;
@@ -907,7 +905,6 @@ out:
int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep,
void **fsdata)
{
@@ -916,6 +913,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct page *page;
struct ext4_iloc iloc;
int retries = 0;
+ unsigned int flags;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -932,17 +930,10 @@ retry_journal:
if (ret && ret != -ENOSPC)
goto out_journal;
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- flags |= AOP_FLAG_NOFS;
-
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
ret = ext4_da_convert_inline_data_to_extent(mapping,
inode,
- flags,
fsdata);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -950,7 +941,13 @@ retry_journal:
goto out;
}
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out_journal;
@@ -1083,14 +1080,14 @@ static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
void *limit;
int de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
if (old_size) {
limit = de_buf + old_size;
do {
prev_de = de;
de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
de_buf += de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
} while (de_buf < limit);
prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
@@ -1155,7 +1152,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
* First create "." and ".." and then copy the dir information
* back to the block.
*/
- de = (struct ext4_dir_entry_2 *)target;
+ de = target;
de = ext4_init_dot_dotdot(inode, de,
inode->i_sb->s_blocksize, csum_size,
le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
@@ -2005,6 +2002,18 @@ int ext4_convert_inline_data(struct inode *inode)
if (!ext4_has_inline_data(inode)) {
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
return 0;
+ } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ /*
+ * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is
+ * cleared. This means we are in the middle of moving inline
+ * data to a delayed-allocation block. Just force a writeout
+ * here to finish the conversion.
+ */
+ error = filemap_flush(inode->i_mapping);
+ if (error)
+ return error;
+ if (!ext4_has_inline_data(inode))
+ return 0;
}
needed_blocks = ext4_writepage_trans_blocks(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 646ece9b3455..3dce7d058985 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,7 +41,6 @@
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>
-#include <linux/dax.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -199,8 +198,7 @@ void ext4_evict_inode(struct inode *inode)
*/
if (inode->i_ino != EXT4_JOURNAL_INO &&
ext4_should_journal_data(inode) &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
- inode->i_data.nrpages) {
+ S_ISREG(inode->i_mode) && inode->i_data.nrpages) {
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
@@ -545,12 +543,21 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
} else {
BUG();
}
+
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return retval;
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
goto found;
}
+ /*
+ * In the no-wait cache-query mode, there is nothing more we
+ * can do if the extent is not found in the cache.
+ */
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return 0;
/*
* Try to see if we can get the block without requesting a new
@@ -837,10 +844,12 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
struct ext4_map_blocks map;
struct buffer_head *bh;
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
+ bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
int err;
ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| handle != NULL || create == 0);
+ ASSERT(create == 0 || !nowait);
map.m_lblk = block;
map.m_len = 1;
@@ -851,6 +860,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (err < 0)
return ERR_PTR(err);
+ if (nowait)
+ return sb_find_get_block(inode->i_sb, map.m_pblk);
+
bh = sb_getblk(inode->i_sb, map.m_pblk);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
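A sketch of the caller pattern the new flag enables (the helper name is illustrative): pass no handle and no CREATE, and accept a NULL result when the block is in neither the extent-status cache nor the buffer cache:

	static struct buffer_head *example_peek_block(struct inode *inode,
						      ext4_lblk_t lblk)
	{
		/* sb_find_get_block() only looks up, never reads from disk,
		 * so this does not block on I/O; it may return NULL */
		return ext4_getblk(NULL, inode, lblk,
				   EXT4_GET_BLOCKS_CACHED_NOWAIT);
	}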
@@ -1130,7 +1142,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
#endif
static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -1144,7 +1156,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- trace_ext4_write_begin(inode, pos, len, flags);
+ trace_ext4_write_begin(inode, pos, len);
/*
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
@@ -1156,7 +1168,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
- flags, pagep);
+ pagep);
if (ret < 0)
return ret;
if (ret == 1)
@@ -1171,7 +1183,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* the page (if needed) without using GFP_NOFS.
*/
retry_grab:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
unlock_page(page);
@@ -2931,7 +2943,7 @@ static int ext4_nonda_switch(struct super_block *sb)
}
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret, retries = 0;
@@ -2944,18 +2956,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
- ext4_verity_in_progress(inode)) {
+ if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
- len, flags, pagep, fsdata);
+ len, pagep, fsdata);
}
*fsdata = (void *)0;
- trace_ext4_da_write_begin(inode, pos, len, flags);
+ trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode,
- pos, len, flags,
+ ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
pagep, fsdata);
if (ret < 0)
return ret;
@@ -2964,7 +2974,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -3181,8 +3191,9 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
return iomap_bmap(mapping, block, &ext4_iomap_ops);
}
-static int ext4_readpage(struct file *file, struct page *page)
+static int ext4_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
int ret = -EAGAIN;
struct inode *inode = page->mapping->host;
@@ -3243,19 +3254,19 @@ static void ext4_journalled_invalidate_folio(struct folio *folio,
WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
}
-static int ext4_releasepage(struct page *page, gfp_t wait)
+static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_releasepage(page);
+ trace_ext4_releasepage(&folio->page);
/* Page has dirty journalled data -> cannot release */
- if (PageChecked(page))
- return 0;
+ if (folio_test_checked(folio))
+ return false;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page);
+ return jbd2_journal_try_to_free_buffers(journal, folio);
else
- return try_to_free_buffers(page);
+ return try_to_free_buffers(folio);
}
static bool ext4_inode_datasync_dirty(struct inode *inode)
@@ -3609,7 +3620,7 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
}
static const struct address_space_operations ext4_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
@@ -3618,7 +3629,7 @@ static const struct address_space_operations ext4_aops = {
.dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
- .releasepage = ext4_releasepage,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3627,7 +3638,7 @@ static const struct address_space_operations ext4_aops = {
};
static const struct address_space_operations ext4_journalled_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
@@ -3636,7 +3647,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.dirty_folio = ext4_journalled_dirty_folio,
.bmap = ext4_bmap,
.invalidate_folio = ext4_journalled_invalidate_folio,
- .releasepage = ext4_releasepage,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -3644,7 +3655,7 @@ static const struct address_space_operations ext4_journalled_aops = {
};
static const struct address_space_operations ext4_da_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
@@ -3653,7 +3664,7 @@ static const struct address_space_operations ext4_da_aops = {
.dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
- .releasepage = ext4_releasepage,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3967,15 +3978,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
trace_ext4_punch_hole(inode, offset, length, 0);
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
- if (ext4_has_inline_data(inode)) {
- filemap_invalidate_lock(mapping);
- ret = ext4_convert_inline_data(inode);
- filemap_invalidate_unlock(mapping);
- if (ret)
- return ret;
- }
-
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -4991,7 +4993,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
- ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations;
@@ -4999,9 +5000,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(ei->i_data) - 1);
} else {
inode->i_op = &ext4_symlink_inode_operations;
- ext4_set_aops(inode);
}
- inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
@@ -5398,6 +5397,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
loff_t oldsize = inode->i_size;
+ loff_t old_disksize;
int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
@@ -5469,6 +5469,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
inode->i_sb->s_blocksize_bits);
down_write(&EXT4_I(inode)->i_data_sem);
+ old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
if (!error)
@@ -5480,6 +5481,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
*/
if (!error)
i_size_write(inode, attr->ia_size);
+ else
+ EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
if (error)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ba44fa1be70a..cb01c1da0f9d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -16,7 +16,6 @@
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/random.h>
-#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
#include <linux/iversion.h>
@@ -504,18 +503,6 @@ journal_err_out:
return err;
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int uuid_is_zero(__u8 u[16])
-{
- int i;
-
- for (i = 0; i < 16; i++)
- if (u[i])
- return 0;
- return 1;
-}
-#endif
-
/*
* If immutable is set and we are not clearing it, we're not allowed to change
* anything else in the inode. Don't error out if we're only trying to set
@@ -1044,7 +1031,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
__u32 flags = 0;
unsigned int flush_flags = 0;
struct super_block *sb = file_inode(filp)->i_sb;
- struct request_queue *q;
if (copy_from_user(&flags, (__u32 __user *)arg,
sizeof(__u32)))
@@ -1065,10 +1051,8 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID)
return -EINVAL;
- q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev);
- if (!q)
- return -ENXIO;
- if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
return -EOPNOTSUPP;
if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
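
Both of these ioctl hunks swap the request_queue probe for block_device-level helpers: bdev_max_discard_sectors() returns 0 on devices without discard support, so the bdev_get_queue() call and its NULL check disappear. The same pattern recurs in the FITRIM paths of ext4, fat and f2fs below:

	if (!bdev_max_discard_sectors(bdev))
		return -EOPNOTSUPP;

	range.minlen = max_t(unsigned int, range.minlen,
			     bdev_discard_granularity(bdev));
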
@@ -1393,14 +1377,13 @@ resizefs_out:
case FITRIM:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
/*
@@ -1432,51 +1415,9 @@ resizefs_out:
return -EOPNOTSUPP;
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- case FS_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_FS_ENCRYPTION
- int err, err2;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- handle_t *handle;
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg);
- if (!ext4_has_feature_encrypt(sb))
- return -EOPNOTSUPP;
- if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
- err = mnt_want_write_file(filp);
- if (err)
- return err;
- handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto pwsalt_err_exit;
- }
- err = ext4_journal_get_write_access(handle, sb,
- sbi->s_sbh,
- EXT4_JTR_NONE);
- if (err)
- goto pwsalt_err_journal;
- lock_buffer(sbi->s_sbh);
- generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
- ext4_superblock_csum_set(sb);
- unlock_buffer(sbi->s_sbh);
- err = ext4_handle_dirty_metadata(handle, NULL,
- sbi->s_sbh);
- pwsalt_err_journal:
- err2 = ext4_journal_stop(handle);
- if (err2 && !err)
- err = err2;
- pwsalt_err_exit:
- mnt_drop_write_file(filp);
- if (err)
- return err;
- }
- if (copy_to_user((void __user *) arg,
- sbi->s_es->s_encrypt_pw_salt, 16))
- return -EFAULT;
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
- }
case FS_IOC_GET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 252c168454c7..9f12f29bc346 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -695,13 +695,10 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (i = 0; i < max; i++) {
if (mb_test_bit(i, buddy)) {
- /* only single bit in buddy2 may be 1 */
+ /* only single bit in buddy2 may be 0 */
if (!mb_test_bit(i << 1, buddy2)) {
MB_CHECK_ASSERT(
mb_test_bit((i<<1)+1, buddy2));
- } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
- MB_CHECK_ASSERT(
- mb_test_bit(i << 1, buddy2));
}
continue;
}
@@ -2919,7 +2916,7 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
{
- struct super_block *sb = (struct super_block *)seq->private;
+ struct super_block *sb = seq->private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
seq_puts(seq, "mballoc:\n");
@@ -3498,7 +3495,7 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
- if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
+ if (bdev_nonrot(sb->s_bdev))
sbi->s_mb_max_linear_groups = 0;
else
sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
@@ -3629,7 +3626,7 @@ static inline int ext4_issue_discard(struct super_block *sb,
return __blkdev_issue_discard(sb->s_bdev,
(sector_t)discard_block << (sb->s_blocksize_bits - 9),
(sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, 0, biop);
+ GFP_NOFS, biop);
} else
return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
@@ -6398,6 +6395,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
+ * @set_trimmed: set the trimmed flag if at least one block is trimmed
*
* ext4_trim_all_free walks through group's block bitmap searching for free
* extents. When the free extent is found, mark it as used in group buddy
@@ -6407,7 +6405,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks)
+ ext4_grpblk_t minblocks, bool set_trimmed)
{
struct ext4_buddy e4b;
int ret;
@@ -6426,7 +6424,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
- if (ret >= 0)
+ if (ret >= 0 && set_trimmed)
EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
} else {
ret = 0;
@@ -6455,7 +6453,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -6463,6 +6461,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
+ bool whole_group, eof = false;
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
@@ -6475,14 +6474,16 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
range->len < sb->s_blocksize)
return -EINVAL;
/* No point to try to trim less than discard granularity */
- if (range->minlen < q->limits.discard_granularity) {
+ if (range->minlen < discard_granularity) {
minlen = EXT4_NUM_B2C(EXT4_SB(sb),
- q->limits.discard_granularity >> sb->s_blocksize_bits);
+ discard_granularity >> sb->s_blocksize_bits);
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
- if (end >= max_blks)
+ if (end >= max_blks - 1) {
end = max_blks - 1;
+ eof = true;
+ }
if (end <= first_data_blk)
goto out;
if (start < first_data_blk)
@@ -6496,6 +6497,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
/* end now represents the last cluster to discard in this group */
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ whole_group = true;
for (group = first_group; group <= last_group; group++) {
grp = ext4_get_group_info(sb, group);
@@ -6512,12 +6514,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
* change it for the last group, note that last_cluster is
* already computed earlier by ext4_get_group_no_and_offset()
*/
- if (group == last_group)
+ if (group == last_group) {
end = last_cluster;
-
+ whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ }
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen, whole_group);
if (cnt < 0) {
ret = cnt;
break;
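
Distilled, the whole_group logic above reduces to the following predicate (a sketch with a hypothetical helper name, not code from the patch): groups before the last are always walked end to end, and the last group only counts as fully scanned when the request ran to end-of-filesystem or its end cluster is the group's final cluster.

	static bool group_scanned_in_full(ext4_group_t group,
					  ext4_group_t last_group,
					  ext4_grpblk_t end,
					  ext4_grpblk_t clusters_per_group,
					  bool eof)
	{
		if (group != last_group)
			return true;
		return eof || end == clusters_per_group - 1;
	}

Only a fully scanned group may have EXT4_MB_GRP_SET_TRIMMED() applied; otherwise a later full-range FITRIM would wrongly skip the untrimmed remainder.
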
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index cebea4270817..79d05e464c43 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -127,7 +127,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
*/
static int kmmpd(void *data)
{
- struct super_block *sb = (struct super_block *) data;
+ struct super_block *sb = data;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
struct mmp_struct *mmp;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 95aa212f0863..701f1d6a217f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
@@ -127,7 +128,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
pgoff_t index1, pgoff_t index2, struct page *page[2])
{
struct address_space *mapping[2];
- unsigned fl = AOP_FLAG_NOFS;
+ unsigned int flags;
BUG_ON(!inode1 || !inode2);
if (inode1 < inode2) {
@@ -139,11 +140,15 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
mapping[1] = inode1->i_mapping;
}
- page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
- if (!page[0])
+ flags = memalloc_nofs_save();
+ page[0] = grab_cache_page_write_begin(mapping[0], index1);
+ if (!page[0]) {
+ memalloc_nofs_restore(flags);
return -ENOMEM;
+ }
- page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
+ page[1] = grab_cache_page_write_begin(mapping[1], index2);
+ memalloc_nofs_restore(flags);
if (!page[1]) {
unlock_page(page[0]);
put_page(page[0]);
@@ -664,8 +669,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
* Up semaphore to avoid following problems:
* a. transaction deadlock among ext4_journal_start,
* ->write_begin via pagefault, and jbd2_journal_commit
- * b. racing with ->readpage, ->write_begin, and ext4_get_block
- * in move_extent_per_page
+ * b. racing with ->read_folio, ->write_begin, and
+ * ext4_get_block in move_extent_per_page
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 767b4bfe39c3..47d0ca4c795b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -277,9 +277,9 @@ static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct dx_hash_info *hinfo,
struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
- struct dx_map_entry map[]);
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
+ struct dx_map_entry *map_tail);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
char *to, struct dx_map_entry *offsets,
@@ -777,12 +777,14 @@ static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
- unsigned count, indirect;
+ unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
struct dx_root *root;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
+ ext4_lblk_t block;
+ ext4_lblk_t blocks[EXT4_HTREE_LEVEL];
memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
@@ -854,6 +856,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
dxtrace(printk("Look up %x", hash));
+ level = 0;
+ blocks[0] = 0;
while (1) {
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
@@ -882,15 +886,27 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
- if (!indirect--)
+
+ block = dx_get_block(at);
+ for (i = 0; i <= level; i++) {
+ if (blocks[i] == block) {
+ ext4_warning_inode(dir,
+ "dx entry: tree cycle block %u points back to block %u",
+ blocks[level], block);
+ goto fail;
+ }
+ }
+ if (++level > indirect)
return frame;
+ blocks[level] = block;
frame++;
- frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ frame->bh = ext4_read_dirblock(dir, block, INDEX);
if (IS_ERR(frame->bh)) {
ret_err = (struct dx_frame *) frame->bh;
frame->bh = NULL;
goto fail;
}
+
entries = ((struct dx_node *) frame->bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit(dir)) {
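
The cycle detection above records the block number visited at each htree level and refuses to follow an index entry that points back at any block already on the path. As a stand-alone model (hypothetical helper, matching the loop in the hunk):

	static bool dx_path_has_cycle(const ext4_lblk_t *blocks,
				      unsigned int level, ext4_lblk_t next)
	{
		unsigned int i;

		for (i = 0; i <= level; i++)
			if (blocks[i] == next)
				return true;	/* corrupted index: node revisited */
		return false;
	}
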
@@ -1249,15 +1265,23 @@ static inline int search_dirblock(struct buffer_head *bh,
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
{
int count = 0;
- char *base = (char *) de;
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data;
+ unsigned int buflen = bh->b_size;
+ char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + blocksize) {
+ if (ext4_has_metadata_csum(dir->i_sb))
+ buflen -= sizeof(struct ext4_dir_entry_tail);
+
+ while ((char *) de < base + buflen) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen,
+ ((char *)de) - base))
+ return -EFSCORRUPTED;
if (de->name_len && de->inode) {
if (ext4_hash_in_dirent(dir))
h.hash = EXT4_DIRENT_HASH(de);
@@ -1270,8 +1294,7 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
count++;
cond_resched();
}
- /* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext4_next_entry(de, blocksize);
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
}
return count;
}
@@ -1943,8 +1966,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
- blocksize, hinfo, map);
+ count = dx_make_map(dir, *bh, hinfo, map);
+ if (count < 0) {
+ err = count;
+ goto journal_error;
+ }
map -= count;
dx_sort_map(map, count);
/* Ensure that neither split block is over half full */
@@ -2031,7 +2057,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -2587,7 +2613,7 @@ int ext4_generic_delete_entry(struct inode *dir,
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *)entry_buf;
+ de = entry_buf;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
entry_buf, buf_size, i))
@@ -3249,6 +3275,32 @@ out_trace:
return retval;
}
+static int ext4_init_symlink_block(handle_t *handle, struct inode *inode,
+ struct fscrypt_str *disk_link)
+{
+ struct buffer_head *bh;
+ char *kaddr;
+ int err = 0;
+
+ bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ kaddr = (char *)bh->b_data;
+ memcpy(kaddr, disk_link->name, disk_link->len);
+ inode->i_size = disk_link->len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+out:
+ brelse(bh);
+ return err;
+}
+
static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, const char *symname)
{
@@ -3257,6 +3309,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
int err, len = strlen(symname);
int credits;
struct fscrypt_str disk_link;
+ int retries = 0;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
@@ -3270,26 +3323,15 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- /*
- * For non-fast symlinks, we just allocate inode and put it on
- * orphan list in the first transaction => we need bitmap,
- * group descriptor, sb, inode block, quota blocks, and
- * possibly selinux xattr blocks.
- */
- credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
- EXT4_XATTR_TRANS_BLOCKS;
- } else {
- /*
- * Fast symlink. We have to add entry to directory
- * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
- * allocate new inode (bitmap, group descriptor, inode block,
- * quota blocks, sb is already counted in previous macros).
- */
- credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
- }
-
+ /*
+ * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the
+ * directory. +3 for inode, inode bitmap, group descriptor allocation.
+ * EXT4_DATA_TRANS_BLOCKS for the data block allocation and
+ * modification.
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
+retry:
inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO,
&dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
@@ -3297,7 +3339,8 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(inode)) {
if (handle)
ext4_journal_stop(handle);
- return PTR_ERR(inode);
+ err = PTR_ERR(inode);
+ goto out_retry;
}
if (IS_ENCRYPTED(inode)) {
@@ -3305,75 +3348,44 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto err_drop_inode;
inode->i_op = &ext4_encrypted_symlink_inode_operations;
+ } else {
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
+ inode->i_op = &ext4_symlink_inode_operations;
+ } else {
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ inode->i_link = (char *)&EXT4_I(inode)->i_data;
+ }
}
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- if (!IS_ENCRYPTED(inode))
- inode->i_op = &ext4_symlink_inode_operations;
- inode_nohighmem(inode);
- ext4_set_aops(inode);
- /*
- * We cannot call page_symlink() with transaction started
- * because it calls into ext4_write_begin() which can wait
- * for transaction commit if we are running out of space
- * and thus we deadlock. So we have to stop transaction now
- * and restart it when symlink contents is written.
- *
- * To keep fs consistent in case of crash, we have to put inode
- * to orphan list in the mean time.
- */
- drop_nlink(inode);
- err = ext4_orphan_add(handle, inode);
- if (handle)
- ext4_journal_stop(handle);
- handle = NULL;
- if (err)
- goto err_drop_inode;
- err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
- if (err)
- goto err_drop_inode;
- /*
- * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
- * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
- */
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- handle = NULL;
- goto err_drop_inode;
- }
- set_nlink(inode, 1);
- err = ext4_orphan_del(handle, inode);
+ /* alloc symlink block and fill it */
+ err = ext4_init_symlink_block(handle, inode, &disk_link);
if (err)
goto err_drop_inode;
} else {
/* clear the extent format for fast symlink */
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
- if (!IS_ENCRYPTED(inode)) {
- inode->i_op = &ext4_fast_symlink_inode_operations;
- inode->i_link = (char *)&EXT4_I(inode)->i_data;
- }
memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
disk_link.len);
inode->i_size = disk_link.len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
}
- EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
- if (inode)
- iput(inode);
- goto out_free_encrypted_link;
+ iput(inode);
+ goto out_retry;
err_drop_inode:
- if (handle)
- ext4_journal_stop(handle);
clear_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
+ if (handle)
+ ext4_journal_stop(handle);
iput(inode);
-out_free_encrypted_link:
+out_retry:
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
if (disk_link.name != (unsigned char *)symname)
kfree(disk_link.name);
return err;
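
Because the symlink block is now allocated inside the one transaction, an -ENOSPC can surface from the combined path, so the function picks up ext4's usual retry idiom. Sketched in isolation (do_alloc_op is a stand-in, not a real function):

	int retries = 0;

retry:
	err = do_alloc_op();
	if (err == -ENOSPC && ext4_should_retry_alloc(sb, &retries))
		goto retry;	/* a jbd2 commit may have freed blocks */
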
@@ -3455,6 +3467,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
+ struct ext4_dir_entry_2 *de;
+ unsigned int offset;
+
/* The first directory block must not be a hole, so
* treat it as DIRENT_HTREE
*/
@@ -3463,9 +3478,30 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
*retval = PTR_ERR(bh);
return NULL;
}
- *parent_de = ext4_next_entry(
- (struct ext4_dir_entry_2 *)bh->b_data,
- inode->i_sb->s_blocksize);
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino ||
+ strcmp(".", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '.'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len,
+ inode->i_sb->s_blocksize);
+ de = ext4_next_entry(de, inode->i_sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, offset) ||
+ le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '..'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ *parent_de = de;
+
return bh;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index af491e170c4a..e02a5f14e021 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -163,7 +163,7 @@ static bool bio_post_read_required(struct bio *bio)
*
* The mpage code never puts partial pages into a BIO (except for end-of-file).
* If a page does not map to a contiguous run of blocks then it simply falls
- * back to block_read_full_page().
+ * back to block_read_full_folio().
*
* Why is this? If a page's completion depends on a number of different BIOs
* which can complete in any order (or at the same time) then determining the
@@ -394,7 +394,7 @@ int ext4_mpage_readpages(struct inode *inode,
bio = NULL;
}
if (!PageUptodate(page))
- block_read_full_page(page, ext4_get_block);
+ block_read_full_folio(page_folio(page), ext4_get_block);
else
unlock_page(page);
next_page:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1466fbdbc8e3..450c918d68fc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1211,6 +1211,9 @@ static void ext4_put_super(struct super_block *sb)
*/
ext4_unregister_sysfs(sb);
+ if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
+ ext4_msg(sb, KERN_INFO, "unmounting filesystem.");
+
ext4_unregister_li_request(sb);
ext4_quota_off_umount(sb);
@@ -1397,7 +1400,7 @@ static void ext4_destroy_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
+ struct ext4_inode_info *ei = foo;
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
@@ -1492,128 +1495,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode)
return ext4_write_inode(inode, &wbc);
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
-{
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
-}
-
-static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
- void *fs_data)
-{
- handle_t *handle = fs_data;
- int res, res2, credits, retries = 0;
-
- /*
- * Encrypting the root directory is not allowed because e2fsck expects
- * lost+found to exist and be unencrypted, and encrypting the root
- * directory would imply encrypting the lost+found directory as well as
- * the filename "lost+found" itself.
- */
- if (inode->i_ino == EXT4_ROOT_INO)
- return -EPERM;
-
- if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
- return -EINVAL;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
- return -EOPNOTSUPP;
-
- res = ext4_convert_inline_data(inode);
- if (res)
- return res;
-
- /*
- * If a journal handle was specified, then the encryption context is
- * being set on a new inode via inheritance and is part of a larger
- * transaction to create the inode. Otherwise the encryption context is
- * being set on an existing inode in its own transaction. Only in the
- * latter case should the "retry on ENOSPC" logic be used.
- */
-
- if (handle) {
- res = ext4_xattr_set_handle(handle, inode,
- EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- ext4_clear_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode, false);
- }
- return res;
- }
-
- res = dquot_initialize(inode);
- if (res)
- return res;
-retry:
- res = ext4_xattr_set_credits(inode, len, false /* is_create */,
- &credits);
- if (res)
- return res;
-
- handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode, false);
- res = ext4_mark_inode_dirty(handle, inode);
- if (res)
- EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
- }
- res2 = ext4_journal_stop(handle);
-
- if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- if (!res)
- res = res2;
- return res;
-}
-
-static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
-{
- return EXT4_SB(sb)->s_dummy_enc_policy.policy;
-}
-
-static bool ext4_has_stable_inodes(struct super_block *sb)
-{
- return ext4_has_feature_stable_inodes(sb);
-}
-
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
-static const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
- .get_context = ext4_get_context,
- .set_context = ext4_set_context,
- .get_dummy_policy = ext4_get_dummy_policy,
- .empty_dir = ext4_empty_dir,
- .has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
-};
-#endif
-
#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
@@ -1867,7 +1748,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
};
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-#define DEFAULT_MB_OPTIMIZE_SCAN (-1)
static const char deprecated_msg[] =
"Mount option \"%s\" will be removed by %s\n"
@@ -1913,6 +1793,7 @@ static const struct mount_opts {
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_NO_EXT2},
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
@@ -2427,11 +2308,12 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size,
GFP_KERNEL);
+ return 0;
#else
ext4_msg(NULL, KERN_WARNING,
- "Test dummy encryption mount option ignored");
+ "test_dummy_encryption option not supported");
+ return -EINVAL;
#endif
- return 0;
case Opt_dax:
case Opt_dax_type:
#ifdef CONFIG_FS_DAX
@@ -2625,8 +2507,10 @@ parse_failed:
ret = ext4_apply_options(fc, sb);
out_free:
- kfree(s_ctx);
- kfree(fc);
+ if (fc) {
+ ext4_fc_free(fc);
+ kfree(fc);
+ }
kfree(s_mount_opts);
return ret;
}
@@ -2786,12 +2670,44 @@ err_jquota_specified:
#endif
}
+static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_FS_ENCRYPTION
+ const struct ext4_fs_context *ctx = fc->fs_private;
+ const struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION))
+ return 0;
+
+ if (!ext4_has_feature_encrypt(sb)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption requires encrypt feature");
+ return -EINVAL;
+ }
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+ !sbi->s_dummy_enc_policy.policy) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Can't set test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+#endif /* CONFIG_FS_ENCRYPTION */
+ return 0;
+}
+
static int ext4_check_opt_consistency(struct fs_context *fc,
struct super_block *sb)
{
struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_sb_info *sbi = fc->s_fs_info;
int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+ int err;
if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
ext4_msg(NULL, KERN_ERR,
@@ -2821,20 +2737,9 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
"for blocksize < PAGE_SIZE");
}
-#ifdef CONFIG_FS_ENCRYPTION
- /*
- * This mount option is just for testing, and it's not worthwhile to
- * implement the extra complexity (e.g. RCU protection) that would be
- * needed to allow it to be set or changed during remount. We do allow
- * it to be specified during remount, but only if there is no change.
- */
- if ((ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) &&
- is_remount && !sbi->s_dummy_enc_policy.policy) {
- ext4_msg(NULL, KERN_WARNING,
- "Can't set test_dummy_encryption on remount");
- return -1;
- }
-#endif
+ err = ext4_check_test_dummy_encryption(fc, sb);
+ if (err)
+ return err;
if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
if (!sbi->s_journal) {
@@ -3837,7 +3742,7 @@ static struct task_struct *ext4_lazyinit_task;
*/
static int ext4_lazyinit_thread(void *arg)
{
- struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+ struct ext4_lazy_init *eli = arg;
struct list_head *pos, *n;
struct ext4_li_request *elr;
unsigned long next_wakeup, cur;
@@ -4409,7 +4314,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
int silent = fc->sb_flags & SB_SILENT;
/* Set defaults for the variables that will be set during parsing */
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sectors_written_start =
@@ -4886,7 +4792,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_inodes_per_block;
sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
sbi->s_sbh = bh;
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
@@ -5279,12 +5185,6 @@ no_journal:
goto failed_mount_wq;
}
- if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
- !ext4_has_feature_encrypt(sb)) {
- ext4_set_feature_encrypt(sb);
- ext4_commit_super(sb);
- }
-
/*
* Get the # of file system overhead blocks from the
* superblock if present.
@@ -5474,13 +5374,9 @@ no_journal:
goto failed_mount9;
}
- if (test_opt(sb, DISCARD)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- ext4_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -6276,7 +6172,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
char *to_free[EXT4_MAXQUOTAS];
#endif
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
/* Store the original options */
old_sb_flags = sb->s_flags;
@@ -6302,9 +6197,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
} else
old_opts.s_qf_names[i] = NULL;
#endif
- if (sbi->s_journal && sbi->s_journal->j_task->io_context)
- ctx->journal_ioprio =
- sbi->s_journal->j_task->io_context->ioprio;
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
+ if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+ ctx->journal_ioprio =
+ sbi->s_journal->j_task->io_context->ioprio;
+ else
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+
+ }
ext4_apply_options(fc, sb);
@@ -6445,7 +6345,8 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if (err)
goto restore_opts;
}
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_mount_state = (le16_to_cpu(es->s_state) &
+ ~EXT4_FC_REPLAY);
err = ext4_setup_super(sb, es, 0);
if (err)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 69109746e6e2..d281f5bcc526 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -27,7 +27,7 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *cpage = NULL;
+ struct buffer_head *bh = NULL;
const void *caddr;
unsigned int max_size;
const char *paddr;
@@ -39,16 +39,19 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
caddr = EXT4_I(inode)->i_data;
max_size = sizeof(EXT4_I(inode)->i_data);
} else {
- cpage = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(cpage))
- return ERR_CAST(cpage);
- caddr = page_address(cpage);
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ caddr = bh->b_data;
max_size = inode->i_sb->s_blocksize;
}
paddr = fscrypt_get_symlink(inode, caddr, max_size, done);
- if (cpage)
- put_page(cpage);
+ brelse(bh);
return paddr;
}
@@ -62,6 +65,38 @@ static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns,
return fscrypt_symlink_getattr(path, stat);
}
+static void ext4_free_link(void *bh)
+{
+ brelse(bh);
+}
+
+static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ struct buffer_head *bh;
+
+ if (!dentry) {
+ bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh || !ext4_buffer_uptodate(bh))
+ return ERR_PTR(-ECHILD);
+ } else {
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ }
+
+ set_delayed_call(callback, ext4_free_link, bh);
+ nd_terminate_link(bh->b_data, inode->i_size,
+ inode->i_sb->s_blocksize - 1);
+ return bh->b_data;
+}
+
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
@@ -70,7 +105,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
};
const struct inode_operations ext4_symlink_inode_operations = {
- .get_link = page_get_link,
+ .get_link = ext4_get_link,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index eacbd489e3bf..b051d19b5c8a 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -69,6 +69,9 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
static int pagecache_write(struct inode *inode, const void *buf, size_t count,
loff_t pos)
{
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+
if (pos + count > inode->i_sb->s_maxbytes)
return -EFBIG;
@@ -79,15 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
void *fsdata;
int res;
- res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
- &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
if (res)
return res;
memcpy_to_page(page, offset_in_page(pos), buf, n);
- res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
- page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
if (res < 0)
return res;
if (res != n)
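
pagecache_write_begin()/pagecache_write_end() were thin wrappers, so calling the address_space operations directly is behaviour-preserving. For reference, a sketch of roughly what the removed begin-side wrapper amounted to once the flags argument was gone (reconstructed, not quoted from the tree):

	static int pagecache_write_begin_sketch(struct file *file,
			struct address_space *mapping, loff_t pos,
			unsigned len, struct page **pagep, void **fsdata)
	{
		return mapping->a_ops->write_begin(file, mapping, pos, len,
						   pagep, fsdata);
	}
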
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 909085a78f9c..456c1e89386a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -468,7 +468,7 @@ const struct address_space_operations f2fs_meta_aops = {
.writepages = f2fs_write_meta_pages,
.dirty_folio = f2fs_dirty_meta_folio,
.invalidate_folio = f2fs_invalidate_folio,
- .releasepage = f2fs_release_page,
+ .release_folio = f2fs_release_folio,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
#endif
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 12a56f9e1572..24824cd96f36 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1746,7 +1746,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
}
const struct address_space_operations f2fs_compress_aops = {
- .releasepage = f2fs_release_page,
+ .release_folio = f2fs_release_folio,
.invalidate_folio = f2fs_invalidate_folio,
};
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9a1a526f2092..8f38c26bb16c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2372,8 +2372,9 @@ next_page:
return ret;
}
-static int f2fs_read_data_page(struct file *file, struct page *page)
+static int f2fs_read_data_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page_file_mapping(page)->host;
int ret = -EAGAIN;
@@ -3314,8 +3315,7 @@ unlock_out:
}
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -3325,7 +3325,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
block_t blkaddr = NULL_ADDR;
int err = 0;
- trace_f2fs_write_begin(inode, pos, len, flags);
+ trace_f2fs_write_begin(inode, pos, len);
if (!f2fs_is_checkpoint_ready(sbi)) {
err = -ENOSPC;
@@ -3528,28 +3528,30 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
folio_detach_private(folio);
}
-int f2fs_release_page(struct page *page, gfp_t wait)
+bool f2fs_release_folio(struct folio *folio, gfp_t wait)
{
- /* If this is dirty page, keep PagePrivate */
- if (PageDirty(page))
- return 0;
+ struct f2fs_sb_info *sbi;
+
+ /* If this is dirty folio, keep private data */
+ if (folio_test_dirty(folio))
+ return false;
/* This is atomic written page, keep Private */
- if (page_private_atomic(page))
- return 0;
+ if (page_private_atomic(&folio->page))
+ return false;
- if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) {
- struct inode *inode = page->mapping->host;
+ sbi = F2FS_M_SB(folio->mapping);
+ if (test_opt(sbi, COMPRESS_CACHE)) {
+ struct inode *inode = folio->mapping->host;
- if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode)))
- clear_page_private_data(page);
+ if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
+ clear_page_private_data(&folio->page);
}
- clear_page_private_gcing(page);
+ clear_page_private_gcing(&folio->page);
- detach_page_private(page);
- set_page_private(page, 0);
- return 1;
+ folio_detach_private(folio);
+ return true;
}
static bool f2fs_dirty_data_folio(struct address_space *mapping,
@@ -3936,7 +3938,7 @@ static void f2fs_swap_deactivate(struct file *file)
#endif
const struct address_space_operations f2fs_dblock_aops = {
- .readpage = f2fs_read_data_page,
+ .read_folio = f2fs_read_data_folio,
.readahead = f2fs_readahead,
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
@@ -3944,7 +3946,7 @@ const struct address_space_operations f2fs_dblock_aops = {
.write_end = f2fs_write_end,
.dirty_folio = f2fs_dirty_data_folio,
.invalidate_folio = f2fs_invalidate_folio,
- .releasepage = f2fs_release_page,
+ .release_folio = f2fs_release_folio,
.direct_IO = noop_direct_IO,
.bmap = f2fs_bmap,
.swap_activate = f2fs_swap_activate,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8c570de21ed5..10d1f138d14f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -18,6 +18,7 @@
#include <linux/kobject.h>
#include <linux/sched.h>
#include <linux/cred.h>
+#include <linux/sched/mm.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -2654,6 +2655,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
pgoff_t index, bool for_write)
{
struct page *page;
+ unsigned int flags;
if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) {
if (!for_write)
@@ -2673,7 +2675,12 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
if (!for_write)
return grab_cache_page(mapping, index);
- return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, index);
+ memalloc_nofs_restore(flags);
+
+ return page;
}
static inline struct page *f2fs_pagecache_get_page(
@@ -3761,7 +3768,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
int compr_blocks, bool allow_balance);
void f2fs_write_failed(struct inode *inode, loff_t to);
void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
-int f2fs_release_page(struct page *page, gfp_t wait);
+bool f2fs_release_folio(struct folio *folio, gfp_t wait);
#ifdef CONFIG_MIGRATION
int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode);
@@ -4372,8 +4379,7 @@ static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
static inline bool f2fs_bdev_support_discard(struct block_device *bdev)
{
- return blk_queue_discard(bdev_get_queue(bdev)) ||
- bdev_is_zoned(bdev);
+ return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev);
}
static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5b89af0f27f0..100637b1adb3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2285,7 +2285,6 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret;
@@ -2304,7 +2303,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
return ret;
range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
ret = f2fs_trim_fs(F2FS_SB(sb), &range);
mnt_drop_write_file(filp);
if (ret < 0)
@@ -3686,18 +3685,18 @@ out:
static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode,
pgoff_t off, block_t block, block_t len, u32 flags)
{
- struct request_queue *q = bdev_get_queue(bdev);
sector_t sector = SECTOR_FROM_BLOCK(block);
sector_t nr_sects = SECTOR_FROM_BLOCK(len);
int ret = 0;
- if (!q)
- return -ENXIO;
-
- if (flags & F2FS_TRIM_FILE_DISCARD)
- ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
- blk_queue_secure_erase(q) ?
- BLKDEV_DISCARD_SECURE : 0);
+ if (flags & F2FS_TRIM_FILE_DISCARD) {
+ if (bdev_max_secure_erase_sectors(bdev))
+ ret = blkdev_issue_secure_erase(bdev, sector, nr_sects,
+ GFP_NOFS);
+ else
+ ret = blkdev_issue_discard(bdev, sector, nr_sects,
+ GFP_NOFS);
+ }
if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) {
if (IS_ENCRYPTED(inode))
@@ -4309,7 +4308,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
*/
inc_page_count(sbi, F2FS_DIO_READ);
dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
- &f2fs_iomap_dio_read_ops, 0, 0);
+ &f2fs_iomap_dio_read_ops, 0, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret != -EIOCBQUEUED)
@@ -4527,7 +4526,7 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (pos + count > inode->i_size)
dio_flags |= IOMAP_DIO_FORCE_WAIT;
dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
- &f2fs_iomap_dio_write_ops, dio_flags, 0);
+ &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret == -ENOTBLK)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index c45d341dcf6e..8ccff18560ff 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2165,7 +2165,7 @@ const struct address_space_operations f2fs_node_aops = {
.writepages = f2fs_write_node_pages,
.dirty_folio = f2fs_dirty_node_folio,
.invalidate_folio = f2fs_invalidate_folio,
- .releasepage = f2fs_release_page,
+ .release_folio = f2fs_release_folio,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
#endif
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index bd9731cdec56..7225ce09f3ab 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1196,9 +1196,8 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
unsigned int *issued)
{
struct block_device *bdev = dc->bdev;
- struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_discard_blocks =
- SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
&(dcc->fstrim_list) : &(dcc->wait_list);
@@ -1245,7 +1244,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
err = __blkdev_issue_discard(bdev,
SECTOR_FROM_BLOCK(start),
SECTOR_FROM_BLOCK(len),
- GFP_NOFS, 0, &bio);
+ GFP_NOFS, &bio);
submit:
if (err) {
spin_lock_irqsave(&dc->lock, flags);
@@ -1375,9 +1374,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
struct discard_cmd *dc;
struct discard_info di = {0};
struct rb_node **insert_p = NULL, *insert_parent = NULL;
- struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_discard_blocks =
- SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
block_t end = lstart + len;
dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 4368f90571bd..ed3e8b7a8260 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2483,7 +2483,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
tocopy = min_t(unsigned long, sb->s_blocksize - offset,
towrite);
retry:
- err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
+ err = a_ops->write_begin(NULL, mapping, off, tocopy,
&page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 3d793202cc9f..65395ae188aa 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -74,6 +74,9 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
static int pagecache_write(struct inode *inode, const void *buf, size_t count,
loff_t pos)
{
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+
if (pos + count > inode->i_sb->s_maxbytes)
return -EFBIG;
@@ -85,8 +88,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
void *addr;
int res;
- res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
- &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
if (res)
return res;
@@ -94,8 +96,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
memcpy(addr + offset_in_page(pos), buf, n);
kunmap_atomic(addr);
- res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
- page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
if (res < 0)
return res;
if (res != n)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 8f5218450a3a..3dae3ed60f3a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -127,13 +127,12 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
struct super_block *sb = inode->i_sb;
struct fstrim_range __user *user_range;
struct fstrim_range range;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
user_range = (struct fstrim_range __user *)arg;
@@ -141,7 +140,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
err = fat_trim_fs(inode, &range);
if (err < 0)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 32e721b88a49..a38238d75c08 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -205,9 +205,9 @@ static int fat_writepages(struct address_space *mapping,
return mpage_writepages(mapping, wbc, fat_get_block);
}
-static int fat_readpage(struct file *file, struct page *page)
+static int fat_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, fat_get_block);
+ return mpage_read_folio(folio, fat_get_block);
}
static void fat_readahead(struct readahead_control *rac)
@@ -226,13 +226,13 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
}
static int fat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int err;
*pagep = NULL;
- err = cont_write_begin(file, mapping, pos, len, flags,
+ err = cont_write_begin(file, mapping, pos, len,
pagep, fsdata, fat_get_block,
&MSDOS_I(mapping->host)->mmu_private);
if (err < 0)
@@ -344,7 +344,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
static const struct address_space_operations fat_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = fat_readpage,
+ .read_folio = fat_read_folio,
.readahead = fat_readahead,
.writepage = fat_writepage,
.writepages = fat_writepages,
@@ -1875,13 +1875,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
goto out_fail;
}
- if (sbi->options.discard) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- fat_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
+ if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev))
+ fat_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
fat_set_state(sb, 1, 0);
return 0;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f15d885b9796..34a3faa4886d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
arg |= O_NONBLOCK;
/* Pipe packetized mode is controlled by O_DIRECT flag */
- if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
- if (!filp->f_mapping || !filp->f_mapping->a_ops ||
- !filp->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if (!S_ISFIFO(inode->i_mode) &&
+ (arg & O_DIRECT) &&
+ !(filp->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (filp->f_op->check_flags)
error = filp->f_op->check_flags(arg);
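
Rather than probing f_mapping->a_ops at fcntl() time, the capability is now a file-mode bit. The presumed counterpart (an assumption based on the flag's name, not shown in this diff) sets FMODE_CAN_ODIRECT once at open time:

	/* assumed open-time setter, e.g. in do_dentry_open() */
	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
		f->f_mode |= FMODE_CAN_ODIRECT;
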
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index bfc780c682fb..a37431e443d3 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -38,33 +38,34 @@
#include "vxfs_inode.h"
-static int vxfs_immed_readpage(struct file *, struct page *);
+static int vxfs_immed_read_folio(struct file *, struct folio *);
/*
* Address space operations for immed files and directories.
*/
const struct address_space_operations vxfs_immed_aops = {
- .readpage = vxfs_immed_readpage,
+ .read_folio = vxfs_immed_read_folio,
};
/**
- * vxfs_immed_readpage - read part of an immed inode into pagecache
+ * vxfs_immed_read_folio - read part of an immed inode into pagecache
* @file: file context (unused)
- * @page: page frame to fill in.
+ * @folio: folio to fill in.
*
* Description:
- * vxfs_immed_readpage reads a part of the immed area of the
+ * vxfs_immed_read_folio reads a part of the immed area of the
* file that hosts @pp into the pagecache.
*
* Returns:
* Zero on success, else a negative error code.
*
* Locking status:
- * @page is locked and will be unlocked.
+ * @folio is locked and will be unlocked.
*/
static int
-vxfs_immed_readpage(struct file *fp, struct page *pp)
+vxfs_immed_read_folio(struct file *fp, struct folio *folio)
{
+ struct page *pp = &folio->page;
struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host);
u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT;
caddr_t kaddr;
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index e806694d4145..6143ebab940d 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -38,11 +38,11 @@
#include "vxfs_extern.h"
-static int vxfs_readpage(struct file *, struct page *);
+static int vxfs_read_folio(struct file *, struct folio *);
static sector_t vxfs_bmap(struct address_space *, sector_t);
const struct address_space_operations vxfs_aops = {
- .readpage = vxfs_readpage,
+ .read_folio = vxfs_read_folio,
.bmap = vxfs_bmap,
};
@@ -141,24 +141,23 @@ vxfs_getblk(struct inode *ip, sector_t iblock,
}
/**
- * vxfs_readpage - read one page synchronously into the pagecache
+ * vxfs_read_folio - read one page synchronously into the pagecache
* @file: file context (unused)
- * @page: page frame to fill in.
+ * @folio: folio to fill in.
*
* Description:
- * The vxfs_readpage routine reads @page synchronously into the
+ * The vxfs_read_folio routine reads @folio synchronously into the
* pagecache.
*
* Returns:
* Zero on success, else a negative error code.
*
* Locking status:
- * @page is locked and will be unlocked.
+ * @folio is locked and will be unlocked.
*/
-static int
-vxfs_readpage(struct file *file, struct page *page)
+static int vxfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, vxfs_getblk);
+ return block_read_full_folio(folio, vxfs_getblk);
}
/**
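
The same mechanical conversion repeats across the filesystems below (fuse, hostfs, hpfs, hfs, hfsplus); a minimal sketch of the pattern, where example_get_block stands in for the filesystem's block mapper:

/* The aops entry now receives a folio; helpers that still want a
 * struct page use &folio->page, which is safe while these
 * filesystems only ever see order-0 folios. */
static int example_read_folio(struct file *file, struct folio *folio)
{
	return block_read_full_folio(folio, example_get_block);
}

static const struct address_space_operations example_aops = {
	.read_folio	= example_read_folio,
};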
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591fe9cf1659..a21d8f1a56d1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
* incorrectly attributed).
*
* To resolve this issue, cgroup writeback detects the majority dirtier of
- * an inode and transfers the ownership to it. To avoid unnnecessary
+ * an inode and transfers the ownership to it. To avoid unnecessary
* oscillation, the detection mechanism keeps track of history and gives
* out the switch verdict only if the foreign usage pattern is stable over
* a certain amount of time and/or writeback attempts.
@@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode,
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
+ else if (!(inode->i_state & I_SYNC_QUEUED) &&
+ (inode->i_state & I_DIRTY))
+ redirty_tail_locked(inode, wb);
+
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -1775,11 +1779,12 @@ static long writeback_sb_inodes(struct super_block *sb,
};
unsigned long start_time = jiffies;
long write_chunk;
- long wrote = 0; /* count both pages and inodes */
+ long total_wrote = 0; /* count both pages and inodes */
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct bdi_writeback *tmp_wb;
+ long wrote;
if (inode->i_sb != sb) {
if (work->sb) {
@@ -1855,7 +1860,9 @@ static long writeback_sb_inodes(struct super_block *sb,
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
- wrote += write_chunk - wbc.nr_to_write;
+ wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
+ wrote = wrote < 0 ? 0 : wrote;
+ total_wrote += wrote;
if (need_resched()) {
/*
@@ -1877,7 +1884,7 @@ static long writeback_sb_inodes(struct super_block *sb,
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
- wrote++;
+ total_wrote++;
requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
@@ -1891,14 +1898,14 @@ static long writeback_sb_inodes(struct super_block *sb,
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
*/
- if (wrote) {
+ if (total_wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
- return wrote;
+ return total_wrote;
}
static long __writeback_inodes_wb(struct bdi_writeback *wb,
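
A worked case for the accounting change above, with assumed numbers:

/* Worked case: write_chunk = 16, wbc.nr_to_write left at 4 (12 pages
 * submitted) but wbc.pages_skipped = 12: the old code reported 12
 * pages of progress, the new code reports 0, so wb_writeback() can
 * see that no real progress was made. */
long wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
wrote = wrote < 0 ? 0 : wrote;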
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9ff27b8a9782..74303d6e987b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1957,20 +1957,20 @@ void fuse_init_dir(struct inode *inode)
fi->rdc.version = 0;
}
-static int fuse_symlink_readpage(struct file *null, struct page *page)
+static int fuse_symlink_read_folio(struct file *null, struct folio *folio)
{
- int err = fuse_readlink_page(page->mapping->host, page);
+ int err = fuse_readlink_page(folio->mapping->host, &folio->page);
if (!err)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
static const struct address_space_operations fuse_symlink_aops = {
- .readpage = fuse_symlink_readpage,
+ .read_folio = fuse_symlink_read_folio,
};
void fuse_init_symlink(struct inode *inode)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f18d14d5fea1..05caa2b9272e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -857,8 +857,9 @@ static int fuse_do_readpage(struct file *file, struct page *page)
return 0;
}
-static int fuse_readpage(struct file *file, struct page *page)
+static int fuse_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
int err;
@@ -1174,7 +1175,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
break;
err = -ENOMEM;
- page = grab_cache_page_write_begin(mapping, index, 0);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
break;
@@ -2273,8 +2274,7 @@ out:
 * but how to implement it without killing performance needs more thinking.
*/
static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
@@ -2284,7 +2284,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
WARN_ON(!fc->writeback_cache);
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
goto error;
@@ -3175,7 +3175,7 @@ static const struct file_operations fuse_file_operations = {
};
static const struct address_space_operations fuse_file_aops = {
- .readpage = fuse_readpage,
+ .read_folio = fuse_read_folio,
.readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
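
grab_cache_page_write_begin() loses its flags argument throughout this series; a hedged sketch of what a caller that previously passed AOP_FLAG_NOFS would do instead (fuse itself needs no such scope here):

unsigned int nofs_flags;

nofs_flags = memalloc_nofs_save();	/* replaces AOP_FLAG_NOFS */
page = grab_cache_page_write_begin(mapping, index);
memalloc_nofs_restore(nofs_flags);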
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9f31ce724..106e90a36583 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -464,22 +464,26 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
return 0;
}
-
-static int __gfs2_readpage(void *file, struct page *page)
+/**
+ * gfs2_read_folio - read a folio from a file
+ * @file: The file to read
+ * @folio: The folio in the file
+ */
+static int gfs2_read_folio(struct file *file, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
if (!gfs2_is_jdata(ip) ||
- (i_blocksize(inode) == PAGE_SIZE && !page_has_buffers(page))) {
- error = iomap_readpage(page, &gfs2_iomap_ops);
+ (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
+ error = iomap_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
- error = stuffed_readpage(ip, page);
- unlock_page(page);
+ error = stuffed_readpage(ip, &folio->page);
+ folio_unlock(folio);
} else {
- error = mpage_readpage(page, gfs2_block_map);
+ error = mpage_read_folio(folio, gfs2_block_map);
}
if (unlikely(gfs2_withdrawn(sdp)))
@@ -489,17 +493,6 @@ static int __gfs2_readpage(void *file, struct page *page)
}
/**
- * gfs2_readpage - read a page of a file
- * @file: The file to read
- * @page: The page of the file
- */
-
-static int gfs2_readpage(struct file *file, struct page *page)
-{
- return __gfs2_readpage(file, page);
-}
-
-/**
* gfs2_internal_read - read an internal file
* @ip: The gfs2 inode
* @buf: The buffer to fill
@@ -523,7 +516,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
amt = size - copied;
if (offset + size > PAGE_SIZE)
amt = PAGE_SIZE - offset;
- page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
+ page = read_cache_page(mapping, index, gfs2_read_folio, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
p = kmap_atomic(page);
@@ -698,38 +691,40 @@ out:
}
/**
- * gfs2_releasepage - free the metadata associated with a page
- * @page: the page that's being released
+ * gfs2_release_folio - free the metadata associated with a folio
+ * @folio: the folio that's being released
* @gfp_mask: passed from Linux VFS, ignored by us
*
- * Calls try_to_free_buffers() to free the buffers and put the page if the
+ * Calls try_to_free_buffers() to free the buffers and put the folio if the
* buffers can be released.
*
- * Returns: 1 if the page was put or else 0
+ * Returns: true if the folio was put or else false
*/
-int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
+bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
struct buffer_head *bh, *head;
struct gfs2_bufdata *bd;
- if (!page_has_buffers(page))
- return 0;
+ head = folio_buffers(folio);
+ if (!head)
+ return false;
/*
- * From xfs_vm_releasepage: mm accommodates an old ext3 case where
- * clean pages might not have had the dirty bit cleared. Thus, it can
- * send actual dirty pages to ->releasepage() via shrink_active_list().
+ * mm accommodates an old ext3 case where clean folios might
+ * not have had the dirty bit cleared. Thus, it can send actual
+ * dirty folios to ->release_folio() via shrink_active_list().
*
- * As a workaround, we skip pages that contain dirty buffers below.
- * Once ->releasepage isn't called on dirty pages anymore, we can warn
- * on dirty buffers like we used to here again.
+ * As a workaround, we skip folios that contain dirty buffers
+ * below. Once ->release_folio isn't called on dirty folios
+ * anymore, we can warn on dirty buffers like we used to here
+ * again.
*/
gfs2_log_lock(sdp);
- head = bh = page_buffers(page);
+ bh = head;
do {
if (atomic_read(&bh->b_count))
goto cannot_release;
@@ -739,9 +734,9 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh)))
goto cannot_release;
bh = bh->b_this_page;
- } while(bh != head);
+ } while (bh != head);
- head = bh = page_buffers(page);
+ bh = head;
do {
bd = bh->b_private;
if (bd) {
@@ -762,20 +757,20 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
} while (bh != head);
gfs2_log_unlock(sdp);
- return try_to_free_buffers(page);
+ return try_to_free_buffers(folio);
cannot_release:
gfs2_log_unlock(sdp);
- return 0;
+ return false;
}
static const struct address_space_operations gfs2_aops = {
.writepage = gfs2_writepage,
.writepages = gfs2_writepages,
- .readpage = gfs2_readpage,
+ .read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
.dirty_folio = filemap_dirty_folio,
- .releasepage = iomap_releasepage,
+ .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
.direct_IO = noop_direct_IO,
@@ -787,12 +782,12 @@ static const struct address_space_operations gfs2_aops = {
static const struct address_space_operations gfs2_jdata_aops = {
.writepage = gfs2_jdata_writepage,
.writepages = gfs2_jdata_writepages,
- .readpage = gfs2_readpage,
+ .read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
.dirty_folio = jdata_dirty_folio,
.bmap = gfs2_bmap,
.invalidate_folio = gfs2_invalidate_folio,
- .releasepage = gfs2_releasepage,
+ .release_folio = gfs2_release_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 39080b2d6cf8..b6697333bb2b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (length != written && (iomap->flags & IOMAP_F_NEW)) {
/* Deallocate blocks that were just allocated. */
- loff_t blockmask = i_blocksize(inode) - 1;
- loff_t end = (pos + length) & ~blockmask;
+ loff_t hstart = round_up(pos + written, i_blocksize(inode));
+ loff_t hend = iomap->offset + iomap->length;
- pos = (pos + written + blockmask) & ~blockmask;
- if (pos < end) {
- truncate_pagecache_range(inode, pos, end - 1);
- punch_hole(ip, pos, end - pos);
+ if (hstart < hend) {
+ truncate_pagecache_range(inode, hstart, hend - 1);
+ punch_hole(ip, hstart, hend - hstart);
}
}
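
A worked example of the new bounds, under assumed values: the end now comes from the extent that was actually allocated rather than from the requested write range, which could leave the tail of a larger allocation in place.

/* Assuming i_blocksize(inode) == 4096, pos == 6144, written == 1024: */
loff_t hstart = round_up(6144 + 1024, 4096);	/* == 8192 */
loff_t hend = iomap->offset + iomap->length;	/* end of the new extent */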
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 48f01323c37c..2cceb193dcd8 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -770,30 +770,27 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
-static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+static inline bool should_fault_in_pages(struct iov_iter *i,
+ struct kiocb *iocb,
size_t *prev_count,
size_t *window_size)
{
size_t count = iov_iter_count(i);
size_t size, offs;
- if (likely(!count))
- return false;
- if (ret <= 0 && ret != -EFAULT)
+ if (!count)
return false;
if (!iter_is_iovec(i))
return false;
size = PAGE_SIZE;
- offs = offset_in_page(i->iov[0].iov_base + i->iov_offset);
+ offs = offset_in_page(iocb->ki_pos);
if (*prev_count != count || !*window_size) {
size_t nr_dirtied;
- size = ALIGN(offs + count, PAGE_SIZE);
- size = min_t(size_t, size, SZ_1M);
nr_dirtied = max(current->nr_dirtied_pause -
current->nr_dirtied, 8);
- size = min(size, nr_dirtied << PAGE_SHIFT);
+ size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT);
}
*prev_count = count;
@@ -807,7 +804,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
size_t prev_count = 0, window_size = 0;
- size_t written = 0;
+ size_t read = 0;
ssize_t ret;
/*
@@ -835,35 +832,33 @@ retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, written);
+ IOMAP_DIO_PARTIAL, NULL, read);
to->nofault = false;
pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
+ /* No increment (+=) because iomap_dio_rw returns a cumulative value. */
if (ret > 0)
- written = ret;
-
- if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
- size_t leftover;
+ read = ret;
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_writeable(to, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
goto retry;
- }
}
+out_unlock:
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
+ /* User space doesn't expect partial success. */
if (ret < 0)
return ret;
- return written;
+ return read;
}
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -873,7 +868,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
size_t prev_count = 0, window_size = 0;
- size_t read = 0;
+ size_t written = 0;
ssize_t ret;
/*
@@ -901,39 +896,37 @@ retry:
goto out_uninit;
/* Silently fall back to buffered I/O when writing beyond EOF */
if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
- goto out;
-retry_under_glock:
+ goto out_unlock;
from->nofault = true;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, read);
+ IOMAP_DIO_PARTIAL, NULL, written);
from->nofault = false;
-
- if (ret == -ENOTBLK)
- ret = 0;
+ if (ret <= 0) {
+ if (ret == -ENOTBLK)
+ ret = 0;
+ if (ret != -EFAULT)
+ goto out_unlock;
+ }
+ /* No increment (+=) because iomap_dio_rw returns a cumulative value. */
if (ret > 0)
- read = ret;
-
- if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
- size_t leftover;
+ written = ret;
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_readable(from, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (window_size)
goto retry;
- }
}
-out:
+out_unlock:
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
+ /* User space doesn't expect partial success. */
if (ret < 0)
return ret;
- return read;
+ return written;
}
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -941,7 +934,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct gfs2_inode *ip;
struct gfs2_holder gh;
size_t prev_count = 0, window_size = 0;
- size_t written = 0;
+ size_t read = 0;
ssize_t ret;
/*
@@ -962,7 +955,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (ret >= 0) {
if (!iov_iter_count(to))
return ret;
- written = ret;
+ read = ret;
} else if (ret != -EFAULT) {
if (ret != -EAGAIN)
return ret;
@@ -975,30 +968,26 @@ retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
-retry_under_glock:
pagefault_disable();
ret = generic_file_read_iter(iocb, to);
pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
if (ret > 0)
- written += ret;
-
- if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
- size_t leftover;
+ read += ret;
- gfs2_holder_allow_demote(&gh);
- leftover = fault_in_iov_iter_writeable(to, window_size);
- gfs2_holder_disallow_demote(&gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(&gh))
- goto retry_under_glock;
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(&gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
goto retry;
- }
}
+out_unlock:
if (gfs2_holder_queued(&gh))
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
- return written ? written : ret;
+ return read ? read : ret;
}
static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
@@ -1012,7 +1001,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
struct gfs2_holder *statfs_gh = NULL;
size_t prev_count = 0, window_size = 0;
size_t orig_count = iov_iter_count(from);
- size_t read = 0;
+ size_t written = 0;
ssize_t ret;
/*
@@ -1030,10 +1019,18 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
retry:
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (!window_size) {
+ ret = -EFAULT;
+ goto out_uninit;
+ }
+ from->count = min(from->count, window_size);
+ }
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
+
if (inode == sdp->sd_rindex) {
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -1050,25 +1047,19 @@ retry_under_glock:
current->backing_dev_info = NULL;
if (ret > 0) {
iocb->ki_pos += ret;
- read += ret;
+ written += ret;
}
if (inode == sdp->sd_rindex)
gfs2_glock_dq_uninit(statfs_gh);
- from->count = orig_count - read;
- if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
- size_t leftover;
-
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_readable(from, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- from->count = min(from->count, window_size - leftover);
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
- goto retry;
- }
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
+
+ from->count = orig_count - written;
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ goto retry;
}
out_unlock:
if (gfs2_holder_queued(gh))
@@ -1077,8 +1068,8 @@ out_uninit:
gfs2_holder_uninit(gh);
if (statfs_gh)
kfree(statfs_gh);
- from->count = orig_count - read;
- return read ? read : ret;
+ from->count = orig_count - written;
+ return written ? written : ret;
}
/**
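
The rewritten loops above all share one shape: attempt the I/O with page faults disabled, and on -EFAULT drop the glock before faulting the iov in manually, since taking a page fault with the glock held could deadlock. A condensed sketch, where lock()/unlock()/do_io()/still_locked() are placeholders:

retry:
	lock();
	pagefault_disable();
	ret = do_io(iocb, iter);		/* may return -EFAULT */
	pagefault_enable();
	if (ret <= 0 && ret != -EFAULT)
		goto out_unlock;
	if (should_fault_in_pages(iter, iocb, &prev_count, &window_size)) {
		unlock();			/* never fault with the glock held */
		window_size -= fault_in_iov_iter_writeable(iter, window_size);
		if (window_size)
			goto retry;
	}
out_unlock:
	if (still_locked())	/* the real code checks gfs2_holder_queued() */
		unlock();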
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 630c6550eacf..c992d53013d3 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -127,9 +127,11 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
kfree(gl->gl_lksb.sb_lvbptr);
- if (gl->gl_ops->go_flags & GLOF_ASPACE)
- kmem_cache_free(gfs2_glock_aspace_cachep, gl);
- else
+ if (gl->gl_ops->go_flags & GLOF_ASPACE) {
+ struct gfs2_glock_aspace *gla =
+ container_of(gl, struct gfs2_glock_aspace, glock);
+ kmem_cache_free(gfs2_glock_aspace_cachep, gla);
+ } else
kmem_cache_free(gfs2_glock_cachep, gl);
}
@@ -1159,7 +1161,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
.ln_sbd = sdp };
struct gfs2_glock *gl, *tmp;
struct address_space *mapping;
- struct kmem_cache *cachep;
int ret = 0;
gl = find_insert_glock(&name, NULL);
@@ -1170,20 +1171,24 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
if (!create)
return -ENOENT;
- if (glops->go_flags & GLOF_ASPACE)
- cachep = gfs2_glock_aspace_cachep;
- else
- cachep = gfs2_glock_cachep;
- gl = kmem_cache_alloc(cachep, GFP_NOFS);
- if (!gl)
- return -ENOMEM;
-
+ if (glops->go_flags & GLOF_ASPACE) {
+ struct gfs2_glock_aspace *gla =
+ kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS);
+ if (!gla)
+ return -ENOMEM;
+ gl = &gla->glock;
+ } else {
+ gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS);
+ if (!gl)
+ return -ENOMEM;
+ }
memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
+ gl->gl_ops = glops;
if (glops->go_flags & GLOF_LVB) {
gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
if (!gl->gl_lksb.sb_lvbptr) {
- kmem_cache_free(cachep, gl);
+ gfs2_glock_dealloc(&gl->gl_rcu);
return -ENOMEM;
}
}
@@ -1197,7 +1202,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
- gl->gl_ops = glops;
gl->gl_dstamp = 0;
preempt_disable();
/* We use the global stats to estimate the initial per-glock stats */
@@ -1234,8 +1238,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
*glp = tmp;
out_free:
- kfree(gl->gl_lksb.sb_lvbptr);
- kmem_cache_free(cachep, gl);
+ gfs2_glock_dealloc(&gl->gl_rcu);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_glock_wait);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 4f8642301801..c0ae9100a0bc 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -138,6 +138,11 @@ struct lm_lockops {
const match_table_t *lm_tokens;
};
+struct gfs2_glock_aspace {
+ struct gfs2_glock glock;
+ struct address_space mapping;
+};
+
extern struct workqueue_struct *gfs2_delete_workqueue;
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -179,8 +184,11 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
{
- if (gl->gl_ops->go_flags & GLOF_ASPACE)
- return (struct address_space *)(gl + 1);
+ if (gl->gl_ops->go_flags & GLOF_ASPACE) {
+ struct gfs2_glock_aspace *gla =
+ container_of(gl, struct gfs2_glock_aspace, glock);
+ return &gla->mapping;
+ }
return NULL;
}
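
The hunks above, and the meta_io.h one below, replace `(gl + 1)` / `mapping - 1` pointer arithmetic with container_of() on an explicit composite type, so conversions in both directions are type-checked instead of depending on allocation layout. A sketch of the reverse helper (mapping_to_glock is an illustrative name, not from this patch):

static inline struct gfs2_glock *mapping_to_glock(struct address_space *m)
{
	return &container_of(m, struct gfs2_glock_aspace, mapping)->glock;
}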
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 7b2c1f390db7..0264d514dda7 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -12,7 +12,7 @@
#include <linux/mm.h>
#include "util.h"
-extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask);
extern int gfs2_internal_read(struct gfs2_inode *ip,
char *buf, loff_t *pos, unsigned size);
extern void gfs2_set_aops(struct inode *inode);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 28d0eb23e18e..244187e3e70f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -62,11 +62,10 @@ static void gfs2_init_glock_once(void *foo)
static void gfs2_init_gl_aspace_once(void *foo)
{
- struct gfs2_glock *gl = foo;
- struct address_space *mapping = (struct address_space *)(gl + 1);
+ struct gfs2_glock_aspace *gla = foo;
- gfs2_init_glock_once(gl);
- address_space_init_once(mapping);
+ gfs2_init_glock_once(&gla->glock);
+ address_space_init_once(&gla->mapping);
}
/**
@@ -104,8 +103,7 @@ static int __init init_gfs2_fs(void)
goto fail_cachep1;
gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
- sizeof(struct gfs2_glock) +
- sizeof(struct address_space),
+ sizeof(struct gfs2_glock_aspace),
0, 0, gfs2_init_gl_aspace_once);
if (!gfs2_glock_aspace_cachep)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index d8bd1d48bd78..868dcc71b581 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -92,14 +92,14 @@ const struct address_space_operations gfs2_meta_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
- .releasepage = gfs2_releasepage,
+ .release_folio = gfs2_release_folio,
};
const struct address_space_operations gfs2_rgrp_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
- .releasepage = gfs2_releasepage,
+ .release_folio = gfs2_release_folio,
};
/**
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 21880d72081a..d0a58cdd433a 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -40,9 +40,11 @@ extern const struct address_space_operations gfs2_rgrp_aops;
static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
{
struct inode *inode = mapping->host;
- if (mapping->a_ops == &gfs2_meta_aops)
- return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd;
- else if (mapping->a_ops == &gfs2_rgrp_aops)
+ if (mapping->a_ops == &gfs2_meta_aops) {
+ struct gfs2_glock_aspace *gla =
+ container_of(mapping, struct gfs2_glock_aspace, mapping);
+ return gla->glock.gl_name.ln_sbd;
+ } else if (mapping->a_ops == &gfs2_rgrp_aops)
return container_of(mapping, struct gfs2_sbd, sd_aspace);
else
return inode->i_sb->s_fs_info;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index be0997e24d60..59d727a4ae2c 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -365,11 +365,12 @@ static void slot_put(struct gfs2_quota_data *qd)
static int bh_get(struct gfs2_quota_data *qd)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
- struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+ struct inode *inode = sdp->sd_qc_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
unsigned int block, offset;
struct buffer_head *bh;
+ struct iomap iomap = { };
int error;
- struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
mutex_lock(&sdp->sd_quota_mutex);
@@ -381,11 +382,17 @@ static int bh_get(struct gfs2_quota_data *qd)
block = qd->qd_slot / sdp->sd_qc_per_block;
offset = qd->qd_slot % sdp->sd_qc_per_block;
- bh_map.b_size = BIT(ip->i_inode.i_blkbits);
- error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
+ error = gfs2_iomap_get(inode,
+ (loff_t)block << inode->i_blkbits,
+ i_blocksize(inode), &iomap);
if (error)
goto fail;
- error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh);
+ error = -ENOENT;
+ if (iomap.type != IOMAP_MAPPED)
+ goto fail;
+
+ error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits,
+ DIO_WAIT, 0, &bh);
if (error)
goto fail;
error = -EIO;
@@ -443,9 +450,8 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
{
- struct gfs2_quota_data *qd = NULL;
+ struct gfs2_quota_data *qd = NULL, *iter;
int error;
- int found = 0;
*qdp = NULL;
@@ -454,15 +460,13 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
spin_lock(&qd_lock);
- list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
- found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen);
- if (found)
+ list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) {
+ if (qd_check_sync(sdp, iter, &sdp->sd_quota_sync_gen)) {
+ qd = iter;
break;
+ }
}
- if (!found)
- qd = NULL;
-
spin_unlock(&qd_lock);
if (qd) {
@@ -531,34 +535,42 @@ static void qdsb_put(struct gfs2_quota_data *qd)
*/
int gfs2_qa_get(struct gfs2_inode *ip)
{
- int error = 0;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- down_write(&ip->i_rw_mutex);
+ spin_lock(&inode->i_lock);
if (ip->i_qadata == NULL) {
- ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
- if (!ip->i_qadata) {
- error = -ENOMEM;
- goto out;
- }
+ struct gfs2_qadata *tmp;
+
+ spin_unlock(&inode->i_lock);
+ tmp = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&inode->i_lock);
+ if (ip->i_qadata == NULL)
+ ip->i_qadata = tmp;
+ else
+ kmem_cache_free(gfs2_qadata_cachep, tmp);
}
ip->i_qadata->qa_ref++;
-out:
- up_write(&ip->i_rw_mutex);
- return error;
+ spin_unlock(&inode->i_lock);
+ return 0;
}
void gfs2_qa_put(struct gfs2_inode *ip)
{
- down_write(&ip->i_rw_mutex);
+ struct inode *inode = &ip->i_inode;
+
+ spin_lock(&inode->i_lock);
if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) {
kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
ip->i_qadata = NULL;
}
- up_write(&ip->i_rw_mutex);
+ spin_unlock(&inode->i_lock);
}
int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
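
gfs2_qa_get() above is an instance of the usual allocate-outside-the-lock pattern: i_lock is a spinlock, so the sleeping allocation must happen with it dropped, and the field is re-checked afterwards in case another task raced in. The generic shape, sketched with placeholder alloc()/free():

spin_lock(&lock);
if (!obj) {
	spin_unlock(&lock);
	tmp = alloc();			/* may sleep */
	if (!tmp)
		return -ENOMEM;
	spin_lock(&lock);
	if (!obj)
		obj = tmp;		/* we won the race */
	else
		free(tmp);		/* somebody else installed one */
}
obj->refs++;
spin_unlock(&lock);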
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 016ed1b2ca1d..2bb085a72e8e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -55,17 +55,16 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
struct list_head *head = &jd->jd_revoke_list;
- struct gfs2_revoke_replay *rr;
- int found = 0;
+ struct gfs2_revoke_replay *rr = NULL, *iter;
- list_for_each_entry(rr, head, rr_list) {
- if (rr->rr_blkno == blkno) {
- found = 1;
+ list_for_each_entry(iter, head, rr_list) {
+ if (iter->rr_blkno == blkno) {
+ rr = iter;
break;
}
}
- if (found) {
+ if (rr) {
rr->rr_where = where;
return 0;
}
@@ -83,18 +82,17 @@ int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
- struct gfs2_revoke_replay *rr;
+ struct gfs2_revoke_replay *rr = NULL, *iter;
int wrap, a, b, revoke;
- int found = 0;
- list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
- if (rr->rr_blkno == blkno) {
- found = 1;
+ list_for_each_entry(iter, &jd->jd_revoke_list, rr_list) {
+ if (iter->rr_blkno == blkno) {
+ rr = iter;
break;
}
}
- if (!found)
+ if (!rr)
return 0;
wrap = (rr->rr_where < jd->jd_replay_tail);
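
Both hunks above apply the same cleanup, preparing for a stricter list_for_each_entry() in which the iterator is not usable after the loop: the match is copied into a separate pointer, and a NULL check replaces the old 'found' flag. The pattern in isolation:

struct gfs2_revoke_replay *rr = NULL, *iter;

list_for_each_entry(iter, head, rr_list)
	if (iter->rr_blkno == blkno) {
		rr = iter;		/* iter never escapes the loop */
		break;
	}
if (!rr)
	return 0;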
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 801ad9f4f2be..8a63870eef5a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1315,7 +1315,7 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
u64 blk;
sector_t start = 0;
sector_t nr_blks = 0;
- int rv;
+ int rv = -EIO;
unsigned int x;
u32 trimmed = 0;
u8 diff;
@@ -1371,7 +1371,7 @@ fail:
if (sdp->sd_args.ar_discard)
fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
sdp->sd_args.ar_discard = 0;
- return -EIO;
+ return rv;
}
/**
@@ -1386,7 +1386,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
{
struct inode *inode = file_inode(filp);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
+ struct block_device *bdev = sdp->sd_vfs->s_bdev;
struct buffer_head *bh;
struct gfs2_rgrpd *rgd;
struct gfs2_rgrpd *rgd_end;
@@ -1405,7 +1405,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
return -EROFS;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (copy_from_user(&r, argp, sizeof(r)))
@@ -1418,8 +1418,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
start = r.start >> bs_shift;
end = start + (r.len >> bs_shift);
minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
- minlen = max_t(u64, minlen,
- q->limits.discard_granularity) >> bs_shift;
+ minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift;
if (end <= start || minlen > sdp->sd_max_rg_data)
return -EINVAL;
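
A worked example of the minlen clamp, under assumed values:

/* Assumed: sb_bsize = 4096, bs_shift = 12, r.minlen = 0,
 * bdev_discard_granularity(bdev) = 65536:
 *   minlen = max(0, 4096)           -> 4096 bytes
 *   minlen = max(4096, 65536) >> 12 -> 16 blocks
 * so trim requests smaller than the device can discard are never issued. */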
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 263d5028d9d1..3f7e9bef9874 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -491,10 +491,10 @@ void hfs_file_truncate(struct inode *inode)
/* XXX: Can use generic_cont_expand? */
size = inode->i_size - 1;
- res = pagecache_write_begin(NULL, mapping, size+1, 0, 0,
- &page, &fsdata);
+ res = hfs_write_begin(NULL, mapping, size + 1, 0, &page,
+ &fsdata);
if (!res) {
- res = pagecache_write_end(NULL, mapping, size+1, 0, 0,
+ res = generic_write_end(NULL, mapping, size + 1, 0, 0,
page, fsdata);
}
if (res)
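
pagecache_write_begin()/pagecache_write_end() are being removed in this series; callers now invoke their own write_begin and generic_write_end() directly. Note the argument shift: generic_write_end() takes both len and copied, which is why the zero-length flush above spells out two zeros. Its signature, for reference:

int generic_write_end(struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned len, unsigned copied,
		      struct page *page, void *fsdata);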
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index b8eb0322a3e5..68d0305880f7 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -201,6 +201,8 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern const struct address_space_operations hfs_aops;
extern const struct address_space_operations hfs_btree_aops;
+int hfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata);
extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
extern int hfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 55f45e9b4930..c4526f16355d 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -34,9 +34,9 @@ static int hfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, hfs_get_block, wbc);
}
-static int hfs_readpage(struct file *file, struct page *page)
+static int hfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, hfs_get_block);
+ return block_read_full_folio(folio, hfs_get_block);
}
static void hfs_write_failed(struct address_space *mapping, loff_t to)
@@ -49,14 +49,13 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int hfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+int hfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
hfs_get_block,
&HFS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -70,14 +69,15 @@ static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hfs_get_block);
}
-static int hfs_releasepage(struct page *page, gfp_t mask)
+static bool hfs_release_folio(struct folio *folio, gfp_t mask)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct hfs_btree *tree;
struct hfs_bnode *node;
u32 nidx;
- int i, res = 1;
+ int i;
+ bool res = true;
switch (inode->i_ino) {
case HFS_EXT_CNID:
@@ -88,27 +88,27 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
break;
default:
BUG();
- return 0;
+ return false;
}
if (!tree)
- return 0;
+ return false;
if (tree->node_size >= PAGE_SIZE) {
- nidx = page->index >> (tree->node_size_shift - PAGE_SHIFT);
+ nidx = folio->index >> (tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
;
else if (atomic_read(&node->refcnt))
- res = 0;
+ res = false;
if (res && node) {
hfs_bnode_unhash(node);
hfs_bnode_free(node);
}
spin_unlock(&tree->hash_lock);
} else {
- nidx = page->index << (PAGE_SHIFT - tree->node_size_shift);
+ nidx = folio->index << (PAGE_SHIFT - tree->node_size_shift);
i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
do {
@@ -116,7 +116,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
if (!node)
continue;
if (atomic_read(&node->refcnt)) {
- res = 0;
+ res = false;
break;
}
hfs_bnode_unhash(node);
@@ -124,7 +124,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
} while (--i && nidx < tree->node_count);
spin_unlock(&tree->hash_lock);
}
- return res ? try_to_free_buffers(page) : 0;
+ return res ? try_to_free_buffers(folio) : false;
}
static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
@@ -161,18 +161,18 @@ static int hfs_writepages(struct address_space *mapping,
const struct address_space_operations hfs_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfs_readpage,
+ .read_folio = hfs_read_folio,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
.bmap = hfs_bmap,
- .releasepage = hfs_releasepage,
+ .release_folio = hfs_release_folio,
};
const struct address_space_operations hfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfs_readpage,
+ .read_folio = hfs_read_folio,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 7054a542689f..721f779b4ec3 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -557,12 +557,12 @@ void hfsplus_file_truncate(struct inode *inode)
void *fsdata;
loff_t size = inode->i_size;
- res = pagecache_write_begin(NULL, mapping, size, 0, 0,
- &page, &fsdata);
+ res = hfsplus_write_begin(NULL, mapping, size, 0,
+ &page, &fsdata);
if (res)
return;
- res = pagecache_write_end(NULL, mapping, size,
- 0, 0, page, fsdata);
+ res = generic_write_end(NULL, mapping, size, 0, 0,
+ page, fsdata);
if (res < 0)
return;
mark_inode_dirty(inode);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 1798949f269b..396e73aa0961 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -468,6 +468,8 @@ extern const struct address_space_operations hfsplus_aops;
extern const struct address_space_operations hfsplus_btree_aops;
extern const struct dentry_operations hfsplus_dentry_operations;
+int hfsplus_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata);
struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode);
void hfsplus_delete_inode(struct inode *inode);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 446a816aa8e1..aeab83ed1c9c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -23,9 +23,9 @@
#include "hfsplus_raw.h"
#include "xattr.h"
-static int hfsplus_readpage(struct file *file, struct page *page)
+static int hfsplus_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, hfsplus_get_block);
+ return block_read_full_folio(folio, hfsplus_get_block);
}
static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
@@ -43,14 +43,13 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+int hfsplus_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
hfsplus_get_block,
&HFSPLUS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -64,14 +63,15 @@ static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hfsplus_get_block);
}
-static int hfsplus_releasepage(struct page *page, gfp_t mask)
+static bool hfsplus_release_folio(struct folio *folio, gfp_t mask)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct hfs_btree *tree;
struct hfs_bnode *node;
u32 nidx;
- int i, res = 1;
+ int i;
+ bool res = true;
switch (inode->i_ino) {
case HFSPLUS_EXT_CNID:
@@ -85,26 +85,26 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
break;
default:
BUG();
- return 0;
+ return false;
}
if (!tree)
- return 0;
+ return false;
if (tree->node_size >= PAGE_SIZE) {
- nidx = page->index >>
+ nidx = folio->index >>
(tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
;
else if (atomic_read(&node->refcnt))
- res = 0;
+ res = false;
if (res && node) {
hfs_bnode_unhash(node);
hfs_bnode_free(node);
}
spin_unlock(&tree->hash_lock);
} else {
- nidx = page->index <<
+ nidx = folio->index <<
(PAGE_SHIFT - tree->node_size_shift);
i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
@@ -113,7 +113,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
if (!node)
continue;
if (atomic_read(&node->refcnt)) {
- res = 0;
+ res = false;
break;
}
hfs_bnode_unhash(node);
@@ -121,7 +121,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
} while (--i && nidx < tree->node_count);
spin_unlock(&tree->hash_lock);
}
- return res ? try_to_free_buffers(page) : 0;
+ return res ? try_to_free_buffers(folio) : false;
}
static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
@@ -158,18 +158,18 @@ static int hfsplus_writepages(struct address_space *mapping,
const struct address_space_operations hfsplus_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfsplus_readpage,
+ .read_folio = hfsplus_read_folio,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
.bmap = hfsplus_bmap,
- .releasepage = hfsplus_releasepage,
+ .release_folio = hfsplus_release_folio,
};
const struct address_space_operations hfsplus_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfsplus_readpage,
+ .read_folio = hfsplus_read_folio,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 14f9ac973a2e..cc1bc6f93a01 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -434,8 +434,9 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
return err;
}
-static int hostfs_readpage(struct file *file, struct page *page)
+static int hostfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
char *buffer;
loff_t start = page_offset(page);
int bytes_read, ret = 0;
@@ -463,12 +464,12 @@ static int hostfs_readpage(struct file *file, struct page *page)
}
static int hostfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
- *pagep = grab_cache_page_write_begin(mapping, index, flags);
+ *pagep = grab_cache_page_write_begin(mapping, index);
if (!*pagep)
return -ENOMEM;
return 0;
@@ -504,7 +505,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
static const struct address_space_operations hostfs_aops = {
.writepage = hostfs_writepage,
- .readpage = hostfs_readpage,
+ .read_folio = hostfs_read_folio,
.dirty_folio = filemap_dirty_folio,
.write_begin = hostfs_write_begin,
.write_end = hostfs_write_end,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 99493a23c5d0..f7547a62c81f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -158,9 +158,9 @@ static const struct iomap_ops hpfs_iomap_ops = {
.iomap_begin = hpfs_iomap_begin,
};
-static int hpfs_readpage(struct file *file, struct page *page)
+static int hpfs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, hpfs_get_block);
+ return mpage_read_folio(folio, hpfs_get_block);
}
static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -194,13 +194,13 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
}
static int hpfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -247,7 +247,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
const struct address_space_operations hpfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hpfs_readpage,
+ .read_folio = hpfs_read_folio,
.writepage = hpfs_writepage,
.readahead = hpfs_readahead,
.writepages = hpfs_writepages,
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d73f8a67168e..15fc63276caa 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -479,8 +479,9 @@ out:
return err;
}
-static int hpfs_symlink_readpage(struct file *file, struct page *page)
+static int hpfs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
char *link = page_address(page);
struct inode *i = page->mapping->host;
struct fnode *fnode;
@@ -508,7 +509,7 @@ fail:
}
const struct address_space_operations hpfs_symlink_aops = {
- .readpage = hpfs_symlink_readpage
+ .read_folio = hpfs_symlink_read_folio
};
static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd3a088db11d..22d2b94339ed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,7 +383,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
static int hugetlbfs_write_begin(struct file *file,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
return -EINVAL;
@@ -405,7 +405,8 @@ static void remove_huge_page(struct page *page)
}
static void
-hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
+ zap_flags_t zap_flags)
{
struct vm_area_struct *vma;
@@ -439,7 +440,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
}
unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
- NULL);
+ NULL, zap_flags);
}
}
@@ -517,7 +518,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vmdelete_list(&mapping->i_mmap,
index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h));
+ (index + 1) * pages_per_huge_page(h),
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
}
@@ -583,7 +585,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_mmap_lock_write(mapping);
i_size_write(inode, offset);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
}
@@ -616,8 +619,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT, 0);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, hole_start, hole_end);
inode_unlock(inode);
@@ -1048,12 +1051,12 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
if (sbinfo->spool) {
long free_pages;
- spin_lock(&sbinfo->spool->lock);
+ spin_lock_irq(&sbinfo->spool->lock);
buf->f_blocks = sbinfo->spool->max_hpages;
free_pages = sbinfo->spool->max_hpages
- sbinfo->spool->used_hpages;
buf->f_bavail = buf->f_bfree = free_pages;
- spin_unlock(&sbinfo->spool->lock);
+ spin_unlock_irq(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
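
The statfs hunk switches to the IRQ-disabling lock variants; the likely reason (an assumption from the lock's other users, not stated in this hunk) is that free_huge_page() can run from interrupt context and takes spool->lock, so every acquirer must disable interrupts:

/* Without _irq: an interrupt arriving while this CPU holds
 * spool->lock, whose handler reaches free_huge_page() ->
 * spin_lock(&spool->lock), would deadlock on the held lock. */
spin_lock_irq(&sbinfo->spool->lock);
/* ... snapshot max_hpages / used_hpages ... */
spin_unlock_irq(&sbinfo->spool->lock);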
diff --git a/fs/internal.h b/fs/internal.h
index 08503dc68d2b..9a6c233ee7f1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -191,3 +191,32 @@ long splice_file_to_pipe(struct file *in,
struct pipe_inode_info *opipe,
loff_t *offset,
size_t len, unsigned int flags);
+
+/*
+ * fs/xattr.c:
+ */
+struct xattr_name {
+ char name[XATTR_NAME_MAX + 1];
+};
+
+struct xattr_ctx {
+ /* Value of attribute */
+ union {
+ const void __user *cvalue;
+ void __user *value;
+ };
+ void *kvalue;
+ size_t size;
+ /* Attribute name */
+ struct xattr_name *kname;
+ unsigned int flags;
+};
+
+
+ssize_t do_getxattr(struct user_namespace *mnt_userns,
+ struct dentry *d,
+ struct xattr_ctx *ctx);
+
+int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
+int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct xattr_ctx *ctx);
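
A hedged sketch of how a setxattr caller fills the new context; uname/uvalue/size/flags are assumed caller-supplied values, and kvalue is left for the helper to populate (an assumption from the field comment above):

struct xattr_name kname;
struct xattr_ctx ctx = {
	.cvalue	= uvalue,	/* user pointer to the value */
	.kvalue	= NULL,		/* populated by setxattr_copy() */
	.size	= size,
	.kname	= &kname,
	.flags	= flags,
};

error = setxattr_copy(uname, &ctx);	/* uname: user pointer to the name */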
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 32aeb2c581c5..824623bcf1a5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
- set_notify_signal(worker->task);
+ __set_notify_signal(worker->task);
wake_up_process(worker->task);
return false;
}
@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
{
if (work && match->fn(work, match->data)) {
work->flags |= IO_WQ_WORK_CANCEL;
- set_notify_signal(worker->task);
+ __set_notify_signal(worker->task);
return true;
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index dbecd27656c7..ba6eee76d028 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
struct io_wq_work {
struct io_wq_work_node list;
unsigned flags;
+ int cancel_seq;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
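
__set_notify_signal() appears to be the variant of set_notify_signal() that sets TIF_NOTIFY_SIGNAL without sending a rescheduling IPI (an assumption from the helper naming in this series); io-wq can use it because it wakes the worker itself on the very next line:

__set_notify_signal(worker->task);	/* flag only, no IPI */
wake_up_process(worker->task);		/* explicit wakeup replaces the kick */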
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7625b29153b9..9f1c682d7caf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -80,6 +80,7 @@
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/xattr.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -94,7 +95,7 @@
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
/* only define max */
-#define IORING_MAX_FIXED_FILES (1U << 15)
+#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -113,6 +114,11 @@
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
+#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
+ IO_REQ_CLEAN_FLAGS)
+
+#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
+
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
struct io_uring {
@@ -166,7 +172,7 @@ struct io_rings {
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
- u32 sq_flags;
+ atomic_t sq_flags;
/*
* Runtime CQ flags
*
@@ -198,13 +204,6 @@ struct io_rings {
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
-enum io_uring_cmd_flags {
- IO_URING_F_COMPLETE_DEFER = 1,
- IO_URING_F_UNLOCKED = 2,
- /* int's last bit, sign checks are usually faster than a bit test */
- IO_URING_F_NONBLOCK = INT_MIN,
-};
-
struct io_mapped_ubuf {
u64 ubuf;
u64 ubuf_end;
@@ -216,10 +215,27 @@ struct io_mapped_ubuf {
struct io_ring_ctx;
struct io_overflow_cqe {
- struct io_uring_cqe cqe;
struct list_head list;
+ struct io_uring_cqe cqe;
};
+/*
+ * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
+ * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
+ * can't safely always dereference the file when the task has exited and ring
+ * cleanup is done. If a file is tracked and part of SCM, then unix gc on
+ * process exit may reap it before __io_sqe_files_unregister() is run.
+ */
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#if defined(CONFIG_64BIT)
+#define FFS_SCM 0x4UL
+#else
+#define IO_URING_SCM_ALL
+#define FFS_SCM 0x0UL
+#endif
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
+
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
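/*
 * Worked example for the FFS_* packing above (assumes struct file
 * pointers are at least 8-byte aligned, leaving the low bits free):
 *   file_ptr = (unsigned long)file | FFS_ISREG | FFS_NOWAIT;
 *   file     = (struct file *)(file_ptr & FFS_MASK);
 */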
@@ -237,6 +253,8 @@ struct io_rsrc_put {
struct io_file_table {
struct io_fixed_file *files;
+ unsigned long *bitmap;
+ unsigned int alloc_hint;
};
struct io_rsrc_node {
@@ -261,10 +279,26 @@ struct io_rsrc_data {
bool quiesce;
};
+#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
struct io_buffer_list {
- struct list_head list;
- struct list_head buf_list;
+ /*
+ * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
+ * then these are classic provided buffers and ->buf_list is used.
+ */
+ union {
+ struct list_head buf_list;
+ struct {
+ struct page **buf_pages;
+ struct io_uring_buf_ring *buf_ring;
+ };
+ };
__u16 bgid;
+
+ /* below is for ring provided buffers */
+ __u16 buf_nr_pages;
+ __u16 nr_entries;
+ __u32 head;
+ __u32 mask;
};
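/*
 * Sketch of the indexing the new fields imply (illustrative): with
 * nr_entries a power of two and mask = nr_entries - 1, the next
 * ring-provided buffer is buf_ring->bufs[head & mask], and head only
 * advances once a request commits the buffer.
 */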
struct io_buffer {
@@ -337,7 +371,7 @@ struct io_ev_fd {
struct rcu_head rcu;
};
-#define IO_BUFFERS_HASH_BITS 5
+#define BGID_ARRAY 64
struct io_ring_ctx {
/* const or read-mostly hot data */
@@ -346,6 +380,7 @@ struct io_ring_ctx {
struct io_rings *rings;
unsigned int flags;
+ enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
@@ -353,6 +388,7 @@ struct io_ring_ctx {
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
unsigned int has_evfd: 1;
+ unsigned int syscall_iopoll: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -382,17 +418,21 @@ struct io_ring_ctx {
*/
struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
+ atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
+
+ struct io_buffer_list *io_bl;
+ struct xarray io_bl_xa;
+ struct list_head io_buffers_cache;
+
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct list_head *io_buffers;
- struct list_head io_buffers_cache;
struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
@@ -409,9 +449,16 @@ struct io_ring_ctx {
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
- unsigned long check_cq_overflow;
+ unsigned long check_cq;
struct {
+ /*
+ * We cache a range of free CQEs we can use, once exhausted it
+ * should go through a slower range setup, see __io_get_cqe()
+ */
+ struct io_uring_cqe *cqe_cached;
+ struct io_uring_cqe *cqe_sentinel;
+
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
@@ -497,7 +544,7 @@ struct io_uring_task {
spinlock_t task_lock;
struct io_wq_work_list task_list;
- struct io_wq_work_list prior_task_list;
+ struct io_wq_work_list prio_task_list;
struct callback_head task_work;
struct file **registered_rings;
bool task_running;
@@ -546,6 +593,16 @@ struct io_accept {
unsigned long nofile;
};
+struct io_socket {
+ struct file *file;
+ int domain;
+ int type;
+ int protocol;
+ int flags;
+ u32 file_slot;
+ unsigned long nofile;
+};
+
struct io_sync {
struct file *file;
loff_t len;
@@ -557,6 +614,8 @@ struct io_sync {
struct io_cancel {
struct file *file;
u64 addr;
+ u32 flags;
+ s32 fd;
};
struct io_timeout {
@@ -585,7 +644,7 @@ struct io_rw {
struct kiocb kiocb;
u64 addr;
u32 len;
- u32 flags;
+ rwf_t flags;
};
struct io_connect {
@@ -602,9 +661,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
- int bgid;
size_t len;
size_t done_io;
+ unsigned int flags;
};
struct io_open {
@@ -722,6 +781,12 @@ struct io_msg {
u32 len;
};
+struct io_nop {
+ struct file *file;
+ u64 extra1;
+ u64 extra2;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -748,6 +813,12 @@ struct io_async_rw {
struct wait_page_queue wpq;
};
+struct io_xattr {
+ struct file *file;
+ struct xattr_ctx ctx;
+ struct filename *filename;
+};
+
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
@@ -766,6 +837,7 @@ enum {
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
+ REQ_F_BUFFER_RING_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_CREDS_BIT,
@@ -776,6 +848,7 @@ enum {
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
+ REQ_F_APOLL_MULTISHOT_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -816,6 +889,8 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ /* buffer selected from ring, needs commit */
+ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
@@ -840,6 +915,8 @@ enum {
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
+ /* fast poll multishot mode */
+ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
};
struct async_poll {
@@ -862,6 +939,21 @@ enum {
IORING_RSRC_BUFFER = 1,
};
+struct io_cqe {
+ __u64 user_data;
+ __s32 res;
+ /* fd initially, then cflags for completion */
+ union {
+ __u32 flags;
+ int fd;
+ };
+};
+
+enum {
+ IO_CHECK_CQ_OVERFLOW_BIT,
+ IO_CHECK_CQ_DROPPED_BIT,
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -897,46 +989,65 @@ struct io_kiocb {
struct io_symlink symlink;
struct io_hardlink hardlink;
struct io_msg msg;
+ struct io_xattr xattr;
+ struct io_socket sock;
+ struct io_nop nop;
+ struct io_uring_cmd uring_cmd;
};
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
+ /*
+ * Can either be a fixed buffer index, or be used with provided buffers.
+ * For the latter, before issue it points to the buffer group ID,
+ * and after selection it points to the buffer ID itself.
+ */
u16 buf_index;
unsigned int flags;
- u64 user_data;
- u32 result;
- /* fd initially, then cflags for completion */
- union {
- u32 cflags;
- int fd;
- };
+ struct io_cqe cqe;
struct io_ring_ctx *ctx;
struct task_struct *task;
- struct percpu_ref *fixed_rsrc_refs;
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ struct io_rsrc_node *rsrc_node;
+
+ union {
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
+
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+
+ /*
+ * stores buffer ID for ring provided buffers, valid IFF
+ * REQ_F_BUFFER_RING is set.
+ */
+ struct io_buffer_list *buf_list;
+ };
union {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
/* cache ->apoll->events */
- int apoll_events;
+ __poll_t apoll_events;
};
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
- struct hlist_node hash_node;
+ union {
+ struct hlist_node hash_node;
+ struct {
+ u64 extra1;
+ u64 extra2;
+ };
+ };
/* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
- struct io_buffer *kbuf;
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
struct io_kiocb *link;
/* custom credentials, valid IFF REQ_F_CREDS is set */
@@ -956,6 +1067,24 @@ struct io_defer_entry {
u32 seq;
};
+struct io_cancel_data {
+ struct io_ring_ctx *ctx;
+ union {
+ u64 data;
+ struct file *file;
+ };
+ u32 flags;
+ int seq;
+};
+
+/*
+ * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
+ * the following sqe if SQE128 is used.
+ */
+#define uring_cmd_pdu_size(is_sqe128) \
+ ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
+ offsetof(struct io_uring_sqe, cmd))
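/*
 * [Editor's sketch, not part of the diff] A worked expansion of
 * uring_cmd_pdu_size(), assuming the conventional SQE layout where
 * sizeof(struct io_uring_sqe) == 64 and the cmd[] payload begins at
 * byte offset 48:
 *
 *   uring_cmd_pdu_size(0) == (1 + 0) * 64 - 48 == 16 bytes inline
 *   uring_cmd_pdu_size(1) == (1 + 1) * 64 - 48 == 80 bytes inline
 *
 * i.e. SQE128 extends the inline command payload by exactly one
 * additional SQE's worth of space.
 */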
+
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
@@ -977,12 +1106,20 @@ struct io_op_def {
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
+ /* supports ioprio */
+ unsigned ioprio : 1;
+ /* supports iopoll */
+ unsigned iopoll : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
static const struct io_op_def io_op_defs[] = {
- [IORING_OP_NOP] = {},
+ [IORING_OP_NOP] = {
+ .audit_skip = 1,
+ .iopoll = 1,
+ .buffer_select = 1,
+ },
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -991,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -1001,6 +1140,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
@@ -1013,6 +1154,8 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -1022,6 +1165,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
@@ -1064,6 +1209,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.poll_exclusive = 1,
+ .ioprio = 1, /* used for flags */
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
@@ -1086,6 +1232,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_CLOSE] = {},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
@@ -1097,6 +1244,8 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1106,6 +1255,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
@@ -1140,9 +1291,11 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -1160,11 +1313,30 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_LINKAT] = {},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
+ .iopoll = 1,
+ },
+ [IORING_OP_FSETXATTR] = {
+ .needs_file = 1
+ },
+ [IORING_OP_SETXATTR] = {},
+ [IORING_OP_FGETXATTR] = {
+ .needs_file = 1
+ },
+ [IORING_OP_GETXATTR] = {},
+ [IORING_OP_SOCKET] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_URING_CMD] = {
+ .needs_file = 1,
+ .plug = 1,
+ .needs_async_setup = 1,
+ .async_size = uring_cmd_pdu_size(1),
},
};
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
+#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
@@ -1173,10 +1345,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
-
-static void io_put_req(struct io_kiocb *req);
-static void io_put_req_deferred(struct io_kiocb *req);
+static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
@@ -1185,10 +1354,10 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
-static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+static struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void io_drop_inflight_file(struct io_kiocb *req);
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
-static void __io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
@@ -1201,11 +1370,115 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);
+static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
+const char *io_uring_get_opcode(u8 opcode)
+{
+ switch ((enum io_uring_op)opcode) {
+ case IORING_OP_NOP:
+ return "NOP";
+ case IORING_OP_READV:
+ return "READV";
+ case IORING_OP_WRITEV:
+ return "WRITEV";
+ case IORING_OP_FSYNC:
+ return "FSYNC";
+ case IORING_OP_READ_FIXED:
+ return "READ_FIXED";
+ case IORING_OP_WRITE_FIXED:
+ return "WRITE_FIXED";
+ case IORING_OP_POLL_ADD:
+ return "POLL_ADD";
+ case IORING_OP_POLL_REMOVE:
+ return "POLL_REMOVE";
+ case IORING_OP_SYNC_FILE_RANGE:
+ return "SYNC_FILE_RANGE";
+ case IORING_OP_SENDMSG:
+ return "SENDMSG";
+ case IORING_OP_RECVMSG:
+ return "RECVMSG";
+ case IORING_OP_TIMEOUT:
+ return "TIMEOUT";
+ case IORING_OP_TIMEOUT_REMOVE:
+ return "TIMEOUT_REMOVE";
+ case IORING_OP_ACCEPT:
+ return "ACCEPT";
+ case IORING_OP_ASYNC_CANCEL:
+ return "ASYNC_CANCEL";
+ case IORING_OP_LINK_TIMEOUT:
+ return "LINK_TIMEOUT";
+ case IORING_OP_CONNECT:
+ return "CONNECT";
+ case IORING_OP_FALLOCATE:
+ return "FALLOCATE";
+ case IORING_OP_OPENAT:
+ return "OPENAT";
+ case IORING_OP_CLOSE:
+ return "CLOSE";
+ case IORING_OP_FILES_UPDATE:
+ return "FILES_UPDATE";
+ case IORING_OP_STATX:
+ return "STATX";
+ case IORING_OP_READ:
+ return "READ";
+ case IORING_OP_WRITE:
+ return "WRITE";
+ case IORING_OP_FADVISE:
+ return "FADVISE";
+ case IORING_OP_MADVISE:
+ return "MADVISE";
+ case IORING_OP_SEND:
+ return "SEND";
+ case IORING_OP_RECV:
+ return "RECV";
+ case IORING_OP_OPENAT2:
+ return "OPENAT2";
+ case IORING_OP_EPOLL_CTL:
+ return "EPOLL_CTL";
+ case IORING_OP_SPLICE:
+ return "SPLICE";
+ case IORING_OP_PROVIDE_BUFFERS:
+ return "PROVIDE_BUFFERS";
+ case IORING_OP_REMOVE_BUFFERS:
+ return "REMOVE_BUFFERS";
+ case IORING_OP_TEE:
+ return "TEE";
+ case IORING_OP_SHUTDOWN:
+ return "SHUTDOWN";
+ case IORING_OP_RENAMEAT:
+ return "RENAMEAT";
+ case IORING_OP_UNLINKAT:
+ return "UNLINKAT";
+ case IORING_OP_MKDIRAT:
+ return "MKDIRAT";
+ case IORING_OP_SYMLINKAT:
+ return "SYMLINKAT";
+ case IORING_OP_LINKAT:
+ return "LINKAT";
+ case IORING_OP_MSG_RING:
+ return "MSG_RING";
+ case IORING_OP_FSETXATTR:
+ return "FSETXATTR";
+ case IORING_OP_SETXATTR:
+ return "SETXATTR";
+ case IORING_OP_FGETXATTR:
+ return "FGETXATTR";
+ case IORING_OP_GETXATTR:
+ return "GETXATTR";
+ case IORING_OP_SOCKET:
+ return "SOCKET";
+ case IORING_OP_URING_CMD:
+ return "URING_CMD";
+ case IORING_OP_LAST:
+ return "INVALID";
+ }
+ return "INVALID";
+}
+
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
@@ -1219,6 +1492,42 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#if defined(CONFIG_UNIX)
+static inline bool io_file_need_scm(struct file *filp)
+{
+#if defined(IO_URING_SCM_ALL)
+ return true;
+#else
+ return !!unix_get_socket(filp);
+#endif
+}
+#else
+static inline bool io_file_need_scm(struct file *filp)
+{
+ return false;
+}
+#endif
+
+static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+ lockdep_assert_held(&ctx->uring_lock);
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+ /*
+ * "Normal" inline submissions always hold the uring_lock, since we
+ * grab it from the system call. Same is true for the SQPOLL offload.
+ * The only exception is when we've detached the request and issue it
+ * from an async worker thread; grab the lock for that case.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
+}
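/*
 * [Editor's sketch, not part of the diff] The intended calling pattern
 * for the two helpers above; example_update() is a hypothetical
 * stand-in for any code that touches uring_lock-protected state.
 */
static void example_update(struct io_ring_ctx *ctx, unsigned issue_flags)
{
	io_ring_submit_lock(ctx, issue_flags);	/* takes the lock only if we came in unlocked (io-wq) */
	/* ... mutate ctx state guarded by uring_lock ... */
	io_ring_submit_unlock(ctx, issue_flags);	/* drops it only if we took it */
}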
+
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
@@ -1280,31 +1589,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
#define IO_RSRC_REF_BATCH 100
+static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
+{
+ percpu_ref_put_many(&node->refs, nr);
+}
+
static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- struct percpu_ref *ref = req->fixed_rsrc_refs;
+ struct io_rsrc_node *node = req->rsrc_node;
- if (ref) {
- if (ref == &ctx->rsrc_node->refs)
+ if (node) {
+ if (node == ctx->rsrc_node)
ctx->rsrc_cached_refs++;
else
- percpu_ref_put(ref);
+ io_rsrc_put_node(node, 1);
}
}
-static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc(struct io_kiocb *req)
{
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
+ if (req->rsrc_node)
+ io_rsrc_put_node(req->rsrc_node, 1);
}
static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
if (ctx->rsrc_cached_refs) {
- percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
ctx->rsrc_cached_refs = 0;
}
}
@@ -1320,8 +1634,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned int issue_flags)
{
- if (!req->fixed_rsrc_refs) {
- req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
+ if (!req->rsrc_node) {
+ req->rsrc_node = ctx->rsrc_node;
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
lockdep_assert_held(&ctx->uring_lock);
@@ -1329,28 +1643,30 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
if (unlikely(ctx->rsrc_cached_refs < 0))
io_rsrc_refs_refill(ctx);
} else {
- percpu_ref_get(req->fixed_rsrc_refs);
+ percpu_ref_get(&req->rsrc_node->refs);
}
}
}
static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
- struct io_buffer *kbuf = req->kbuf;
- unsigned int cflags;
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list)
+ req->buf_list->head++;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ } else {
+ list_add(&req->kbuf->list, list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ }
- cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- list_add(&kbuf->list, list);
- req->kbuf = NULL;
- return cflags;
+ return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
lockdep_assert_held(&req->ctx->completion_lock);
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}
@@ -1360,7 +1676,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
{
unsigned int cflags;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
/*
@@ -1375,7 +1691,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
- if (issue_flags & IO_URING_F_UNLOCKED) {
+ if (req->flags & REQ_F_BUFFER_RING) {
+ /* no buffers to recycle for this case */
+ cflags = __io_put_kbuf(req, NULL);
+ } else if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
@@ -1393,15 +1712,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
- struct list_head *hash_list;
- struct io_buffer_list *bl;
-
- hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- list_for_each_entry(bl, hash_list, list)
- if (bl->bgid == bgid || bgid == -1U)
- return bl;
+ if (ctx->io_bl && bgid < BGID_ARRAY)
+ return &ctx->io_bl[bgid];
- return NULL;
+ return xa_load(&ctx->io_bl_xa, bgid);
}
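/*
 * [Editor's note] This replaces the old hashed-list lookup with a
 * two-tier scheme: group IDs below BGID_ARRAY (64) index a flat array
 * for O(1) access on the hot path, while larger or sparse IDs fall
 * back to an xarray lookup. io_buffer_add_list() later in this patch
 * mirrors the same split on the insertion side.
 */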
static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
@@ -1410,25 +1724,33 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
struct io_buffer_list *bl;
struct io_buffer *buf;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return;
/* don't recycle if we already did IO to this buffer */
if (req->flags & REQ_F_PARTIAL_IO)
return;
+ /*
+ * We don't need to recycle for REQ_F_BUFFER_RING; we can just clear
+ * the flag and hence ensure that bl->head doesn't get incremented.
+ * If the tail has already been incremented, hang on to it.
+ */
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list) {
+ req->buf_index = req->buf_list->bgid;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ }
+ return;
+ }
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_lock(&ctx->uring_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, issue_flags);
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->kbuf = NULL;
+ req->buf_index = buf->bgid;
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_unlock(&ctx->uring_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
@@ -1469,7 +1791,12 @@ static inline void req_set_fail(struct io_kiocb *req)
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
req_set_fail(req);
- req->result = res;
+ req->cqe.res = res;
+}
+
+static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1506,12 +1833,14 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int i, hash_bits;
+ int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ xa_init(&ctx->io_bl_xa);
+
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
@@ -1533,13 +1862,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
- ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
- sizeof(struct list_head), GFP_KERNEL);
- if (!ctx->io_buffers)
- goto err;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
- INIT_LIST_HEAD(&ctx->io_buffers[i]);
-
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1575,7 +1897,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
}
@@ -1599,10 +1922,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_NOWAIT 0x1UL
-#define FFS_ISREG 0x2UL
-#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
-
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
return req->flags & REQ_F_FIXED_FILE;
@@ -1629,6 +1948,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return __io_prep_linked_timeout(req);
}
+static noinline void __io_arm_ltimeout(struct io_kiocb *req)
+{
+ io_queue_linked_timeout(__io_prep_linked_timeout(req));
+}
+
+static inline void io_arm_ltimeout(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
+ __io_arm_ltimeout(req);
+}
+
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1641,6 +1971,7 @@ static void io_prep_async_work(struct io_kiocb *req)
req->work.list.next = NULL;
req->work.flags = 0;
+ req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -1672,17 +2003,15 @@ static void io_prep_async_link(struct io_kiocb *req)
static inline void io_req_add_compl_list(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
+ struct io_submit_state *state = &req->ctx->submit_state;
if (!(req->flags & REQ_F_CQE_SKIP))
- ctx->submit_state.flush_cqes = true;
+ state->flush_cqes = true;
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
-static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
+static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
@@ -1702,8 +2031,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
- &req->work, io_wq_is_hashed(&req->work));
+ trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
+ req->opcode, req->flags, &req->work,
+ io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1721,8 +2051,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_fill_cqe_req(req, status, 0);
- io_put_req_deferred(req);
+ io_req_tw_post_queue(req, status, 0);
}
}
@@ -1804,21 +2133,53 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+/*
+ * writes to the cq entry need to come after reading head; the
+ * control dependency is enough as we're using WRITE_ONCE to
+ * fill the cq entry
+ */
+static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
- unsigned tail, mask = ctx->cq_entries - 1;
-
- /*
- * writes to the cq entry need to come after reading head; the
- * control dependency is enough as we're using WRITE_ONCE to
- * fill the cq entry
- */
- if (__io_cqring_events(ctx) == ctx->cq_entries)
+ unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
+ unsigned int shift = 0;
+ unsigned int free, queued, len;
+
+ if (ctx->flags & IORING_SETUP_CQE32)
+ shift = 1;
+
+ /* userspace may cheat by modifying the tail; be safe and take the min */
+ queued = min(__io_cqring_events(ctx), ctx->cq_entries);
+ free = ctx->cq_entries - queued;
+ /* we need a contiguous range, limit based on the current array offset */
+ len = min(free, ctx->cq_entries - off);
+ if (!len)
return NULL;
- tail = ctx->cached_cq_tail++;
- return &rings->cqes[tail & mask];
+ ctx->cached_cq_tail++;
+ ctx->cqe_cached = &rings->cqes[off];
+ ctx->cqe_sentinel = ctx->cqe_cached + len;
+ ctx->cqe_cached++;
+ return &rings->cqes[off << shift];
+}
+
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+{
+ if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
+ struct io_uring_cqe *cqe = ctx->cqe_cached;
+
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ unsigned int off = ctx->cqe_cached - ctx->rings->cqes;
+
+ cqe += off;
+ }
+
+ ctx->cached_cq_tail++;
+ ctx->cqe_cached++;
+ return cqe;
+ }
+
+ return __io_get_cqe(ctx);
}
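/*
 * [Editor's note] The caching contract between the two functions above:
 * cqe_cached .. cqe_sentinel-1 is a run of CQ slots already known to be
 * free and contiguous in the ring array, so the common case in
 * io_get_cqe() is a bare pointer increment. Only when the run is
 * exhausted does __io_get_cqe() re-derive a fresh run from head/tail,
 * bounded by both the free count and the wrap point of the array. With
 * IORING_SETUP_CQE32 each logical entry spans two struct io_uring_cqe
 * slots, hence the shift/offset adjustments.
 */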
static void io_eventfd_signal(struct io_ring_ctx *ctx)
@@ -1889,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
bool all_flushed, posted;
+ size_t cqe_size = sizeof(struct io_uring_cqe);
if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
return false;
+ if (ctx->flags & IORING_SETUP_CQE32)
+ cqe_size <<= 1;
+
posted = false;
spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->cq_overflow_list)) {
@@ -1904,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
if (cqe)
- memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+ memcpy(cqe, &ocqe->cqe, cqe_size);
else
io_account_cq_overflow(ctx);
@@ -1915,13 +2280,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
- clear_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
+ clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+ atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
- if (posted)
- io_commit_cqring(ctx);
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
@@ -1932,7 +2295,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
bool ret = true;
- if (test_bit(0, &ctx->check_cq_overflow)) {
+ if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
@@ -1944,19 +2307,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
return ret;
}
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
+static void __io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
- if (likely(task == current)) {
- tctx->cached_refs += nr;
- } else {
- percpu_counter_sub(&tctx->inflight, nr);
- if (unlikely(atomic_read(&tctx->in_idle)))
- wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
- }
+ percpu_counter_sub(&tctx->inflight, nr);
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ wake_up(&tctx->wait);
+ put_task_struct_many(task, nr);
+}
+
+/* must be called shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+ if (likely(task == current))
+ task->io_uring->cached_refs += nr;
+ else
+ __io_put_task(task, nr);
}
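/*
 * [Editor's note] The split keeps the overwhelmingly common
 * task == current case -- a plain cached_refs increment -- small enough
 * to stay inline, while the cross-task slow path (percpu counter
 * update, idle wakeup, task_struct puts) moves out of line into
 * __io_put_task().
 */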
static void io_task_refs_refill(struct io_uring_task *tctx)
@@ -1990,11 +2357,18 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
+ s32 res, u32 cflags, u64 extra1,
+ u64 extra2)
{
struct io_overflow_cqe *ocqe;
+ size_t ocq_size = sizeof(struct io_overflow_cqe);
+ bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+
+ if (is_cqe32)
+ ocq_size += sizeof(struct io_uring_cqe);
- ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+ ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
+ trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
@@ -2002,17 +2376,21 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
* on the floor.
*/
io_account_cq_overflow(ctx);
+ set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+ set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+ atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
ocqe->cqe.flags = cflags;
+ if (is_cqe32) {
+ ocqe->cqe.big_cqe[0] = extra1;
+ ocqe->cqe.big_cqe[1] = extra2;
+ }
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
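/*
 * [Editor's note] struct io_overflow_cqe embeds a single 16-byte CQE,
 * so for IORING_SETUP_CQE32 the allocation grows by one extra
 * sizeof(struct io_uring_cqe) -- exactly the 16 bytes that big_cqe[0]
 * and big_cqe[1] occupy beyond the base entry.
 */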
@@ -2034,42 +2412,114 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
WRITE_ONCE(cqe->flags, cflags);
return true;
}
- return io_cqring_event_overflow(ctx, user_data, res, cflags);
+ return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+}
+
+static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_uring_cqe *cqe;
+
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, 0, 0);
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(*cqe));
+ return true;
+ }
+ return io_cqring_event_overflow(ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, 0, 0);
+}
+
+static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_uring_cqe *cqe;
+ u64 extra1 = req->extra1;
+ u64 extra2 = req->extra2;
+
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, extra1, extra2);
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
+ cqe->big_cqe[0] = extra1;
+ cqe->big_cqe[1] = extra2;
+ return true;
+ }
+
+ return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
+ req->cqe.flags, extra1, extra2);
}
static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
- trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
- return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0);
+ return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
}
-static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags,
+ u64 extra1, u64 extra2)
{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_cqe *cqe;
+
+ if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+ return;
+ if (req->flags & REQ_F_CQE_SKIP)
+ return;
+
+ trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags,
+ extra1, extra2);
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ WRITE_ONCE(cqe->user_data, req->cqe.user_data);
+ WRITE_ONCE(cqe->res, res);
+ WRITE_ONCE(cqe->flags, cflags);
+ WRITE_ONCE(cqe->big_cqe[0], extra1);
+ WRITE_ONCE(cqe->big_cqe[1], extra2);
+ return;
+ }
+
+ io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
{
ctx->cq_extra++;
- trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
return __io_fill_cqe(ctx, user_data, res, cflags);
}
-static void __io_req_complete_post(struct io_kiocb *req, s32 res,
- u32 cflags)
+static void __io_req_complete_put(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
*/
if (req_ref_put_and_test(req)) {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (req->flags & IO_REQ_LINK_FLAGS) {
if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
if (req->link) {
@@ -2077,7 +2527,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
req->link = NULL;
}
}
- io_req_put_rsrc(req, ctx);
+ io_req_put_rsrc(req);
/*
* Selected buffer deallocation in io_clean_op() assumes that
* we don't hold ->completion_lock. Clean them here to avoid
@@ -2091,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
}
}
-static void io_req_complete_post(struct io_kiocb *req, s32 res,
- u32 cflags)
+static void __io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
+{
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, res, cflags);
+ __io_req_complete_put(req);
+}
+
+static void __io_req_complete_post32(struct io_kiocb *req, s32 res,
+ u32 cflags, u64 extra1, u64 extra2)
+{
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe32_req(req, res, cflags, extra1, extra2);
+ __io_req_complete_put(req);
+}
+
+static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2103,11 +2568,23 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
io_cqring_ev_posted(ctx);
}
+static void io_req_complete_post32(struct io_kiocb *req, s32 res,
+ u32 cflags, u64 extra1, u64 extra2)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ __io_req_complete_post32(req, res, cflags, extra1, extra2);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+}
+
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
u32 cflags)
{
- req->result = res;
- req->cflags = cflags;
+ req->cqe.res = res;
+ req->cqe.flags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
@@ -2120,8 +2597,23 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
+static inline void __io_req_complete32(struct io_kiocb *req,
+ unsigned int issue_flags, s32 res,
+ u32 cflags, u64 extra1, u64 extra2)
+{
+ if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
+ io_req_complete_state(req, res, cflags);
+ req->extra1 = extra1;
+ req->extra2 = extra2;
+ } else {
+ io_req_complete_post32(req, res, cflags, extra1, extra2);
+ }
+}
+
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
+ if (res < 0)
+ req_set_fail(req);
__io_req_complete(req, 0, res, 0);
}
@@ -2131,17 +2623,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res)
io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
-static void io_req_complete_fail_submit(struct io_kiocb *req)
-{
- /*
- * We don't submit, fail them all, for that replace hardlinks with
- * normal links. Extra REQ_F_LINK is tolerated.
- */
- req->flags &= ~REQ_F_HARDLINK;
- req->flags |= REQ_F_LINK;
- io_req_complete_failed(req, req->result);
-}
-
/*
* Don't initialise the fields below on every allocation, but do that in
* advance and keep them valid across allocations.
@@ -2152,7 +2633,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
req->link = NULL;
req->async_data = NULL;
/* not necessary, but safer to zero */
- req->result = 0;
+ req->cqe.res = 0;
}
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
@@ -2164,19 +2645,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock);
}
-/* Returns true IFF there are requests in the cache */
-static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
+static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
- struct io_submit_state *state = &ctx->submit_state;
-
- /*
- * If we have more than a batch's worth of requests in our IRQ side
- * locked cache, grab the lock and move them over to our submission
- * side cache.
- */
- if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
- io_flush_cached_locked_reqs(ctx, state);
- return !!state->free_list.next;
+ return !ctx->submit_state.free_list.next;
}
/*
@@ -2188,14 +2659,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
void *reqs[IO_REQ_ALLOC_BATCH];
- struct io_kiocb *req;
int ret, i;
- if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
- return true;
+ /*
+ * If we have more than a batch's worth of requests in our IRQ side
+ * locked cache, grab the lock and move them over to our submission
+ * side cache.
+ */
+ if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
+ io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
+ if (!io_req_cache_empty(ctx))
+ return true;
+ }
ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
@@ -2212,17 +2689,17 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
percpu_ref_get_many(&ctx->refs, ret);
for (i = 0; i < ret; i++) {
- req = reqs[i];
+ struct io_kiocb *req = reqs[i];
io_preinit_req(req, ctx);
- wq_stack_add_head(&req->comp_list, &state->free_list);
+ io_req_add_to_cache(req, ctx);
}
return true;
}
static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
- if (unlikely(!ctx->submit_state.free_list.next))
+ if (unlikely(io_req_cache_empty(ctx)))
return __io_alloc_req_refill(ctx);
return true;
}
@@ -2251,11 +2728,11 @@ static inline void io_dismantle_req(struct io_kiocb *req)
io_put_file(req->file);
}
-static __cold void __io_free_req(struct io_kiocb *req)
+static __cold void io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- io_req_put_rsrc(req, ctx);
+ io_req_put_rsrc(req);
io_dismantle_req(req);
io_put_task(req->task, 1);
@@ -2273,7 +2750,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req)
nxt->link = NULL;
}
-static bool io_kill_linked_timeout(struct io_kiocb *req)
+static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
@@ -2286,13 +2763,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
link->timeout.head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
list_del(&link->timeout.list);
- /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
- io_fill_cqe_req(link, -ECANCELED, 0);
- io_put_req_deferred(link);
- return true;
+ return link;
}
}
- return false;
+ return NULL;
}
static void io_fail_links(struct io_kiocb *req)
@@ -2306,19 +2780,19 @@ static void io_fail_links(struct io_kiocb *req)
long res = -ECANCELED;
if (link->flags & REQ_F_FAIL)
- res = link->result;
+ res = link->cqe.res;
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
req->opcode, link);
- if (!ignore_cqes) {
+ if (ignore_cqes)
+ link->flags |= REQ_F_CQE_SKIP;
+ else
link->flags &= ~REQ_F_CQE_SKIP;
- io_fill_cqe_req(link, res, 0);
- }
- io_put_req_deferred(link);
+ __io_req_complete_post(link, res, 0);
link = nxt;
}
}
@@ -2326,25 +2800,27 @@ static void io_fail_links(struct io_kiocb *req)
static bool io_disarm_next(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
+ struct io_kiocb *link = NULL;
bool posted = false;
if (req->flags & REQ_F_ARM_LTIMEOUT) {
- struct io_kiocb *link = req->link;
-
+ link = req->link;
req->flags &= ~REQ_F_ARM_LTIMEOUT;
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
io_remove_next_linked(req);
- /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
- io_fill_cqe_req(link, -ECANCELED, 0);
- io_put_req_deferred(link);
+ io_req_tw_post_queue(link, -ECANCELED, 0);
posted = true;
}
} else if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock_irq(&ctx->timeout_lock);
- posted = io_kill_linked_timeout(req);
+ link = io_disarm_linked_timeout(req);
spin_unlock_irq(&ctx->timeout_lock);
+ if (link) {
+ posted = true;
+ io_req_tw_post_queue(link, -ECANCELED, 0);
+ }
}
if (unlikely((req->flags & REQ_F_FAIL) &&
!(req->flags & REQ_F_HARDLINK))) {
@@ -2361,8 +2837,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req)
spin_lock(&ctx->completion_lock);
posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(ctx);
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
@@ -2372,8 +2847,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
@@ -2391,6 +2864,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (*locked) {
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -2434,7 +2909,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
if (likely(*uring_locked))
req->io_task_work.func(req, uring_locked);
else
- __io_req_complete_post(req, req->result,
+ __io_req_complete_post(req, req->cqe.res,
io_put_kbuf_comp(req));
node = next;
} while (node);
@@ -2475,15 +2950,11 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node1, *node2;
- if (!tctx->task_list.first &&
- !tctx->prior_task_list.first && uring_locked)
- io_submit_flush_completions(ctx);
-
spin_lock_irq(&tctx->task_lock);
- node1 = tctx->prior_task_list.first;
+ node1 = tctx->prio_task_list.first;
node2 = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prior_task_list);
+ INIT_WQ_LIST(&tctx->prio_task_list);
if (!node2 && !node1)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
@@ -2492,10 +2963,13 @@ static void tctx_task_work(struct callback_head *cb)
if (node1)
handle_prev_tw_list(node1, &ctx, &uring_locked);
-
if (node2)
handle_tw_list(node2, &ctx, &uring_locked);
cond_resched();
+
+ if (data_race(!tctx->task_list.first) &&
+ data_race(!tctx->prio_task_list.first) && uring_locked)
+ io_submit_flush_completions(ctx);
}
ctx_flush_and_put(ctx, &uring_locked);
@@ -2505,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb)
io_uring_drop_tctx_refs(current);
}
-static void io_req_task_work_add(struct io_kiocb *req, bool priority)
+static void __io_req_task_work_add(struct io_kiocb *req,
+ struct io_uring_task *tctx,
+ struct io_wq_work_list *list)
{
- struct task_struct *tsk = req->task;
- struct io_uring_task *tctx = tsk->io_uring;
- enum task_work_notify_mode notify;
+ struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
unsigned long flags;
bool running;
- WARN_ON_ONCE(!tctx);
-
io_drop_inflight_file(req);
spin_lock_irqsave(&tctx->task_lock, flags);
- if (priority)
- wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
- else
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ wq_list_add_tail(&req->io_task_work.node, list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -2532,22 +3001,15 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
if (running)
return;
- /*
- * SQPOLL kernel thread doesn't need notification, just a wakeup. For
- * all other cases, use TWA_SIGNAL unconditionally to ensure we're
- * processing task_work. There's no reliable way to tell if TWA_RESUME
- * will do the job.
- */
- notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
- if (notify == TWA_NONE)
- wake_up_process(tsk);
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+
+ if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
- }
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
+ node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -2559,47 +3021,73 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
}
}
-static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
+static void io_req_task_work_add(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
+ __io_req_task_work_add(req, tctx, &tctx->task_list);
+}
+
+static void io_req_task_prio_work_add(struct io_kiocb *req)
+{
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ if (req->ctx->flags & IORING_SETUP_SQPOLL)
+ __io_req_task_work_add(req, tctx, &tctx->prio_task_list);
+ else
+ __io_req_task_work_add(req, tctx, &tctx->task_list);
+}
+
+static void io_req_tw_post(struct io_kiocb *req, bool *locked)
+{
+ io_req_complete_post(req, req->cqe.res, req->cqe.flags);
+}
+
+static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ req->cqe.res = res;
+ req->cqe.flags = cflags;
+ req->io_task_work.func = io_req_tw_post;
+ io_req_task_work_add(req);
+}
+
+static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
+{
/* not needed for normal modes, but SQPOLL depends on it */
- io_tw_lock(ctx, locked);
- io_req_complete_failed(req, req->result);
+ io_tw_lock(req->ctx, locked);
+ io_req_complete_failed(req, req->cqe.res);
}
static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- io_tw_lock(ctx, locked);
+ io_tw_lock(req->ctx, locked);
/* req->task == current here, checking PF_EXITING is safe */
if (likely(!(req->task->flags & PF_EXITING)))
- __io_queue_sqe(req);
+ io_queue_sqe(req);
else
io_req_complete_failed(req, -EFAULT);
}
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
- req->result = ret;
+ req->cqe.res = ret;
req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
}
static void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
}
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
- req->io_task_work.func = io_queue_async_work;
- io_req_task_work_add(req, false);
+ req->io_task_work.func = io_queue_iowq;
+ io_req_task_work_add(req);
}
-static inline void io_queue_next(struct io_kiocb *req)
+static void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2607,17 +3095,6 @@ static inline void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt);
}
-static void io_free_req(struct io_kiocb *req)
-{
- io_queue_next(req);
- __io_free_req(req);
-}
-
-static void io_free_req_work(struct io_kiocb *req, bool *locked)
-{
- io_free_req(req);
-}
-
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
@@ -2629,15 +3106,30 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (unlikely(req->flags & REQ_F_REFCOUNT)) {
- node = req->comp_list.next;
- if (!req_ref_put_and_test(req))
- continue;
+ if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req->flags & REQ_F_REFCOUNT) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
+ if (req->flags & IO_REQ_LINK_FLAGS)
+ io_queue_next(req);
+ if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
+ io_clean_op(req);
}
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ io_put_file(req->file);
io_req_put_rsrc_locked(req, ctx);
- io_queue_next(req);
- io_dismantle_req(req);
if (req->task != task) {
if (task)
@@ -2647,7 +3139,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
}
task_refs++;
node = req->comp_list.next;
- wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ io_req_add_to_cache(req, ctx);
} while (node);
if (task)
@@ -2666,16 +3158,11 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, req->result, req->cflags);
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- struct async_poll *apoll = req->apoll;
-
- if (apoll->double_poll)
- kfree(apoll->double_poll);
- list_add(&apoll->poll.wait.entry,
- &ctx->apoll_cache);
- req->flags &= ~REQ_F_POLLED;
+ if (!(req->flags & REQ_F_CQE_SKIP)) {
+ if (!(ctx->flags & IORING_SETUP_CQE32))
+ __io_fill_cqe_req_filled(ctx, req);
+ else
+ __io_fill_cqe32_req_filled(ctx, req);
}
}
@@ -2698,23 +3185,18 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
struct io_kiocb *nxt = NULL;
if (req_ref_put_and_test(req)) {
- nxt = io_req_find_next(req);
- __io_free_req(req);
+ if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
+ nxt = io_req_find_next(req);
+ io_free_req(req);
}
return nxt;
}
static inline void io_put_req(struct io_kiocb *req)
{
- if (req_ref_put_and_test(req))
- io_free_req(req);
-}
-
-static inline void io_put_req_deferred(struct io_kiocb *req)
-{
if (req_ref_put_and_test(req)) {
- req->io_task_work.func = io_free_req_work;
- io_req_task_work_add(req, false);
+ io_queue_next(req);
+ io_free_req(req);
}
}
@@ -2800,7 +3282,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
- __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
+ __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
}
if (unlikely(!nr_events))
@@ -2846,22 +3328,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
unsigned int nr_events = 0;
int ret = 0;
+ unsigned long check_cq;
/*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
- /*
* Don't enter poll loop if we already have events pending.
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (test_bit(0, &ctx->check_cq_overflow))
+ check_cq = READ_ONCE(ctx->check_cq);
+ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
__io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx))
- goto out;
+ return 0;
+
+ /*
+ * Similarly do not spin if we have not informed the user of any
+ * dropped CQE.
+ */
+ if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
+ return -EBADR;
+
do {
/*
* If a submit got punted to a workqueue, we can have the
@@ -2891,8 +3377,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
nr_events += ret;
ret = 0;
} while (nr_events < min && !need_resched());
-out:
- mutex_unlock(&ctx->uring_lock);
+
return ret;
}
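/*
 * [Editor's note] The -EBADR return propagates out of io_uring_enter(),
 * so an IORING_SETUP_IOPOLL application is told once that a completion
 * was dropped (IO_CHECK_CQ_DROPPED_BIT) rather than spinning forever
 * for a CQE that will never arrive.
 */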
@@ -2965,21 +3450,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
} else {
fsnotify_access(req->file);
}
- if (unlikely(res != req->result)) {
+ if (unlikely(res != req->cqe.res)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
return true;
}
req_set_fail(req);
- req->result = res;
+ req->cqe.res = res;
}
return false;
}
static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- int res = req->result;
+ int res = req->cqe.res;
if (*locked) {
io_req_complete_state(req, res, io_put_kbuf(req, 0));
@@ -2995,7 +3480,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res,
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result,
+ __io_req_complete(req, issue_flags, req->cqe.res,
io_put_kbuf(req, issue_flags));
}
@@ -3005,9 +3490,9 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
if (__io_complete_rw_common(req, res))
return;
- req->result = res;
+ req->cqe.res = res;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
+ io_req_task_prio_work_add(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
@@ -3016,12 +3501,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (unlikely(res != req->result)) {
+ if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
return;
}
- req->result = res;
+ req->cqe.res = res;
}
/* order with io_iopoll_complete() checking ->iopoll_completed */
@@ -3131,6 +3616,8 @@ static unsigned int io_file_get_flags(struct file *file)
res |= FFS_ISREG;
if (__io_file_supports_nowait(file, mode))
res |= FFS_NOWAIT;
+ if (io_file_need_scm(file))
+ res |= FFS_SCM;
return res;
}
@@ -3162,6 +3649,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->rw.flags = READ_ONCE(sqe->rw_flags);
+ /* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -3310,77 +3798,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
return __io_import_fixed(req, rw, iter, imu);
}
-static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
-{
- if (needs_lock)
- mutex_unlock(&ctx->uring_lock);
-}
-
-static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
+static int io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
{
- /*
- * "Normal" inline submissions always hold the uring_lock, since we
- * grab it from the system call. Same is true for the SQPOLL offload.
- * The only exception is when we've detached the request and issue it
- * from an async worker thread, grab the lock for that case.
- */
- if (needs_lock)
- mutex_lock(&ctx->uring_lock);
-}
-
-static void io_buffer_add_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned int bgid)
-{
- struct list_head *list;
-
- list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- INIT_LIST_HEAD(&bl->buf_list);
bl->bgid = bgid;
- list_add(&bl->list, list);
+ if (bgid < BGID_ARRAY)
+ return 0;
+
+ return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
-static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, unsigned int issue_flags)
+static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl)
{
- struct io_buffer *kbuf = req->kbuf;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
-
- if (req->flags & REQ_F_BUFFER_SELECTED)
- return kbuf;
-
- io_ring_submit_lock(ctx, needs_lock);
+ if (!list_empty(&bl->buf_list)) {
+ struct io_buffer *kbuf;
- lockdep_assert_held(&ctx->uring_lock);
-
- bl = io_buffer_get_list(ctx, bgid);
- if (bl && !list_empty(&bl->buf_list)) {
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
+ req->buf_index = kbuf->bid;
+ return u64_to_user_ptr(kbuf->addr);
+ }
+ return NULL;
+}
+
+static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
+{
+ struct io_uring_buf_ring *br = bl->buf_ring;
+ struct io_uring_buf *buf;
+ __u32 head = bl->head;
+
+ if (unlikely(smp_load_acquire(&br->tail) == head)) {
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return NULL;
+ }
+
+ head &= bl->mask;
+ if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+ buf = &br->bufs[head];
} else {
- kbuf = ERR_PTR(-ENOBUFS);
+ int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
+ int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1;
+ buf = page_address(bl->buf_pages[index]);
+ buf += off;
}
+ if (*len > buf->len)
+ *len = buf->len;
+ req->flags |= REQ_F_BUFFER_RING;
+ req->buf_list = bl;
+ req->buf_index = buf->bid;
- io_ring_submit_unlock(req->ctx, needs_lock);
- return kbuf;
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ /*
+ * If we came in unlocked, we have no choice but to consume the
+ * buffer here. This does mean it'll be pinned until the IO
+ * completes. But coming in unlocked means we're in io-wq
+ * context, hence there should be no further retry. For the
+ * locked case, the caller must ensure the commit is called when
+ * the transfer completes (or if we get -EAGAIN and must poll
+ * or retry).
+ */
+ req->buf_list = NULL;
+ bl->head++;
+ }
+ return u64_to_user_ptr(buf->addr);
}
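/*
 * [Editor's sketch, not part of the diff] The producer/consumer
 * contract behind the head/tail checks above, for a power-of-two ring
 * of bl->mask + 1 entries:
 *
 *   userspace (producer): fill bufs[tail & mask], then
 *                         smp_store_release(&br->tail, tail + 1);
 *   kernel (consumer):    smp_load_acquire(&br->tail) == head -> empty,
 *                         otherwise consume bufs[head & mask] and bump
 *                         bl->head once the buffer is committed.
 *
 * The acquire/release pairing orders the buffer contents against the
 * tail update, so buf->addr can be dereferenced without extra barriers.
 */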
-static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
+static void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
+ unsigned int issue_flags)
{
- struct io_buffer *kbuf;
- u16 bgid;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ void __user *ret = NULL;
- bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, issue_flags);
- if (IS_ERR(kbuf))
- return kbuf;
- return u64_to_user_ptr(kbuf->addr);
+ io_ring_submit_lock(req->ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, req->buf_index);
+ if (likely(bl)) {
+ if (bl->buf_nr_pages)
+ ret = io_ring_buffer_select(req, len, bl, issue_flags);
+ else
+ ret = io_provided_buffer_select(req, len, bl);
+ }
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -3390,7 +3897,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
struct compat_iovec __user *uiov;
compat_ssize_t clen;
void __user *buf;
- ssize_t len;
+ size_t len;
uiov = u64_to_user_ptr(req->rw.addr);
if (!access_ok(uiov, sizeof(*uiov)))
@@ -3401,11 +3908,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = (compat_size_t) len;
+ req->rw.len = iov[0].iov_len = (compat_size_t) len;
return 0;
}
#endif
@@ -3423,22 +3931,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = len;
+ req->rw.len = iov[0].iov_len = len;
return 0;
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
unsigned int issue_flags)
{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf = req->kbuf;
-
- iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov[0].iov_len = kbuf->len;
+ if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
+ iov[0].iov_base = u64_to_user_ptr(req->rw.addr);
+ iov[0].iov_len = req->rw.len;
return 0;
}
if (req->rw.len != 1)
@@ -3452,6 +3959,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, issue_flags);
}
+static inline bool io_do_buffer_select(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return false;
+ return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
+}
+
static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
struct io_rw_state *s,
unsigned int issue_flags)
@@ -3470,18 +3984,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
return NULL;
}
- /* buffer index only valid with fixed read/write, or buffer select */
- if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
- return ERR_PTR(-EINVAL);
-
buf = u64_to_user_ptr(req->rw.addr);
sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
- if (IS_ERR(buf))
- return ERR_CAST(buf);
+ if (io_do_buffer_select(req)) {
+ buf = io_buffer_select(req, &sqe_len, issue_flags);
+ if (!buf)
+ return ERR_PTR(-ENOBUFS);
+ req->rw.addr = (unsigned long) buf;
req->rw.len = sqe_len;
}
@@ -3783,6 +4294,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
+ kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
@@ -3835,7 +4347,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
kfree(iovec);
return ret;
}
- req->result = iov_iter_count(&s->iter);
+ req->cqe.res = iov_iter_count(&s->iter);
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
@@ -3851,7 +4363,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ppos = io_kiocb_update_pos(req);
- ret = rw_verify_area(READ, req->file, ppos, req->result);
+ ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3873,7 +4385,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret == req->result || ret <= 0 || !force_nonblock ||
+ } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
@@ -3963,7 +4475,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
kfree(iovec);
return ret;
}
- req->result = iov_iter_count(&s->iter);
+ req->cqe.res = iov_iter_count(&s->iter);
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
@@ -3983,7 +4495,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
ppos = io_kiocb_update_pos(req);
- ret = rw_verify_area(WRITE, req->file, ppos, req->result);
+ ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
if (unlikely(ret))
goto out_free;
@@ -4047,9 +4559,7 @@ static int io_renameat_prep(struct io_kiocb *req,
struct io_rename *ren = &req->rename;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4086,22 +4596,257 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
ren->newpath, ren->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
+static inline void __io_xattr_finish(struct io_kiocb *req)
+{
+ struct io_xattr *ix = &req->xattr;
+
+ if (ix->filename)
+ putname(ix->filename);
+
+ kfree(ix->ctx.kname);
+ kvfree(ix->ctx.kvalue);
+}
+
+static void io_xattr_finish(struct io_kiocb *req, int ret)
+{
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ __io_xattr_finish(req);
+ io_req_complete(req, ret);
+}
+
+static int __io_getxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *name;
+ int ret;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ix->filename = NULL;
+ ix->ctx.kvalue = NULL;
+ name = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ix->ctx.size = READ_ONCE(sqe->len);
+ ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
+
+ if (ix->ctx.flags)
+ return -EINVAL;
+
+ ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
+ if (!ix->ctx.kname)
+ return -ENOMEM;
+
+ ret = strncpy_from_user(ix->ctx.kname->name, name,
+ sizeof(ix->ctx.kname->name));
+ if (!ret || ret == sizeof(ix->ctx.kname->name))
+ ret = -ERANGE;
+ if (ret < 0) {
+ kfree(ix->ctx.kname);
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_fgetxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return __io_getxattr_prep(req, sqe);
+}
+
+static int io_getxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *path;
+ int ret;
+
+ ret = __io_getxattr_prep(req, sqe);
+ if (ret)
+ return ret;
+
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+
+ ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+ if (IS_ERR(ix->filename)) {
+ ret = PTR_ERR(ix->filename);
+ ix->filename = NULL;
+ }
+
+ return ret;
+}
+
+static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_xattr *ix = &req->xattr;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
+ req->file->f_path.dentry,
+ &ix->ctx);
+
+ io_xattr_finish(req, ret);
+ return 0;
+}
+
+static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_xattr *ix = &req->xattr;
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+ struct path path;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+retry:
+ ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
+ if (!ret) {
+ ret = do_getxattr(mnt_user_ns(path.mnt),
+ path.dentry,
+ &ix->ctx);
+
+ path_put(&path);
+ if (retry_estale(ret, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+ }
+
+ io_xattr_finish(req, ret);
+ return 0;
+}
+
+static int __io_setxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *name;
+ int ret;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ix->filename = NULL;
+ name = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ix->ctx.kvalue = NULL;
+ ix->ctx.size = READ_ONCE(sqe->len);
+ ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
+
+ ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
+ if (!ix->ctx.kname)
+ return -ENOMEM;
+
+ ret = setxattr_copy(name, &ix->ctx);
+ if (ret) {
+ kfree(ix->ctx.kname);
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_setxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *path;
+ int ret;
+
+ ret = __io_setxattr_prep(req, sqe);
+ if (ret)
+ return ret;
+
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+
+ ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+ if (IS_ERR(ix->filename)) {
+ ret = PTR_ERR(ix->filename);
+ ix->filename = NULL;
+ }
+
+ return ret;
+}
+
+static int io_fsetxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return __io_setxattr_prep(req, sqe);
+}
+
+static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
+ struct path *path)
+{
+ struct io_xattr *ix = &req->xattr;
+ int ret;
+
+ ret = mnt_want_write(path->mnt);
+ if (!ret) {
+ ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
+ mnt_drop_write(path->mnt);
+ }
+
+ return ret;
+}
+
+static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = __io_setxattr(req, issue_flags, &req->file->f_path);
+ io_xattr_finish(req, ret);
+
+ return 0;
+}
+
+static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_xattr *ix = &req->xattr;
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+ struct path path;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+retry:
+ ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
+ if (!ret) {
+ ret = __io_setxattr(req, issue_flags, &path);
+ path_put(&path);
+ if (retry_estale(ret, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+ }
+
+ io_xattr_finish(req, ret);
+ return 0;
+}
+
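From userspace the four new opcodes mirror the *xattr(2) family, and the SQE layout follows directly from the prep helpers above: addr carries the attribute name, addr2 the value buffer, addr3 the path (for the non-f variants), len the value size, and xattr_flags must be zero on the get side. A sketch using a raw SQE, assuming headers new enough to define IORING_OP_GETXATTR and the addr3/xattr_flags fields:

#include <liburing.h>
#include <string.h>

static int queue_getxattr(struct io_uring *ring, const char *path,
                          const char *name, char *value, unsigned int len)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_GETXATTR;
        sqe->addr = (unsigned long) name;      /* attribute name */
        sqe->addr2 = (unsigned long) value;    /* value buffer (ctx.cvalue) */
        sqe->addr3 = (unsigned long) path;     /* path, resolved with LOOKUP_FOLLOW */
        sqe->len = len;                        /* value buffer size */
        sqe->xattr_flags = 0;                  /* must be zero for a get */
        return io_uring_submit(ring);
}

On success cqe->res is the attribute's size, matching the do_getxattr() return value.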
static int io_unlinkat_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_unlink *un = &req->unlink;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4135,8 +4880,6 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_unlinkat(un->dfd, un->filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4147,10 +4890,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
struct io_mkdir *mkd = &req->mkdir;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4178,8 +4918,6 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4190,10 +4928,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
struct io_symlink *sl = &req->symlink;
const char __user *oldpath, *newpath;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4227,8 +4962,6 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4239,9 +4972,7 @@ static int io_linkat_prep(struct io_kiocb *req,
struct io_hardlink *lnk = &req->hardlink;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4278,9 +5009,97 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
lnk->newpath, lnk->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
+{
+ req->uring_cmd.task_work_cb(&req->uring_cmd);
+}
+
+void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *))
+{
+ struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+
+ req->uring_cmd.task_work_cb = task_work_cb;
+ req->io_task_work.func = io_uring_cmd_work;
+ io_req_task_prio_work_add(req);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
+
+/*
+ * Called by consumers of io_uring_cmd, if they originally returned
+ * -EIOCBQUEUED upon receiving the command.
+ */
+void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
+{
+ struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+
if (ret < 0)
req_set_fail(req);
- io_req_complete(req, ret);
+ if (req->ctx->flags & IORING_SETUP_CQE32)
+ __io_req_complete32(req, 0, ret, 0, res2, 0);
+ else
+ io_req_complete(req, ret);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_done);
+
+static int io_uring_cmd_prep_async(struct io_kiocb *req)
+{
+ size_t cmd_size;
+
+ cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
+
+ memcpy(req->async_data, req->uring_cmd.cmd, cmd_size);
+ return 0;
+}
+
+static int io_uring_cmd_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_uring_cmd *ioucmd = &req->uring_cmd;
+
+ if (sqe->rw_flags)
+ return -EINVAL;
+ ioucmd->cmd = sqe->cmd;
+ ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
+ return 0;
+}
+
+static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_uring_cmd *ioucmd = &req->uring_cmd;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = req->file;
+ int ret;
+
+ if (!req->file->f_op->uring_cmd)
+ return -EOPNOTSUPP;
+
+ if (ctx->flags & IORING_SETUP_SQE128)
+ issue_flags |= IO_URING_F_SQE128;
+ if (ctx->flags & IORING_SETUP_CQE32)
+ issue_flags |= IO_URING_F_CQE32;
+ if (ctx->flags & IORING_SETUP_IOPOLL)
+ issue_flags |= IO_URING_F_IOPOLL;
+
+ if (req_has_async_data(req))
+ ioucmd->cmd = req->async_data;
+
+ ret = file->f_op->uring_cmd(ioucmd, issue_flags);
+ if (ret == -EAGAIN) {
+ if (!req_has_async_data(req)) {
+ if (io_alloc_async_data(req))
+ return -ENOMEM;
+ io_uring_cmd_prep_async(req);
+ }
+ return -EAGAIN;
+ }
+
+ if (ret != -EIOCBQUEUED)
+ io_uring_cmd_done(ioucmd, ret, 0);
return 0;
}
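On the provider side, any file implementing the ->uring_cmd() hook gets three completion models: return the result inline, return -EIOCBQUEUED and call io_uring_cmd_done() later, or bounce completion into task context via io_uring_cmd_complete_in_task(). A hedged kernel-side sketch for a fictional driver (all demo_* names are hypothetical):

#include <linux/fs.h>
#include <linux/io_uring.h>

enum { DEMO_CMD_SYNC, DEMO_CMD_ASYNC };        /* hypothetical cmd_op values */

static void demo_cmd_tw(struct io_uring_cmd *ioucmd)
{
        /* runs in task context, scheduled by io_uring_cmd_complete_in_task() */
        io_uring_cmd_done(ioucmd, 0, 0);
}

static int demo_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
        /* command payload, if any, sits in ioucmd->cmd (80 bytes with SQE128) */
        switch (ioucmd->cmd_op) {
        case DEMO_CMD_SYNC:
                return 0;                      /* completes inline, res = 0 */
        case DEMO_CMD_ASYNC:
                io_uring_cmd_complete_in_task(ioucmd, demo_cmd_tw);
                return -EIOCBQUEUED;           /* CQE posted later */
        }
        return -ENOTTY;
}

demo_uring_cmd would be wired up as the .uring_cmd member of the driver's file_operations. Returning -EAGAIN instead triggers the async-data copy seen in io_uring_cmd() above, since SQE memory is not stable across a punt to io-wq.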
@@ -4288,9 +5107,7 @@ static int io_shutdown_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
@@ -4315,8 +5132,6 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
ret = __sys_shutdown_sock(sock, req->shutdown.how);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -4330,9 +5145,6 @@ static int __io_splice_prep(struct io_kiocb *req,
struct io_splice *sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
if (unlikely(sp->flags & ~valid_flags))
@@ -4377,7 +5189,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
done:
if (ret != sp->len)
req_set_fail(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, 0, ret, 0);
return 0;
}
@@ -4422,7 +5234,20 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
done:
if (ret != sp->len)
req_set_fail(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, 0, ret, 0);
+ return 0;
+}
+
+static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ /*
+ * If the ring is set up with CQE32, relay back addr/addr2
+ */
+ if (req->ctx->flags & IORING_SETUP_CQE32) {
+ req->nop.extra1 = READ_ONCE(sqe->addr);
+ req->nop.extra2 = READ_ONCE(sqe->addr2);
+ }
+
return 0;
}
@@ -4431,20 +5256,31 @@ done:
*/
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
+ unsigned int cflags;
+ void __user *buf;
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ size_t len = 1;
- __io_req_complete(req, issue_flags, 0, 0);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ }
+
+ cflags = io_put_kbuf(req, issue_flags);
+ if (!(req->ctx->flags & IORING_SETUP_CQE32))
+ __io_req_complete(req, issue_flags, 0, cflags);
+ else
+ __io_req_complete32(req, issue_flags, 0, cflags,
+ req->nop.extra1, req->nop.extra2);
return 0;
}
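A userspace sketch of the CQE32 relay, assuming liburing >= 2.2 for the IORING_SETUP_SQE128/CQE32 flags and the big_cqe[] accessor; the two extra fields come back exactly as stored into nop.extra1/extra2 above:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
                return 1;
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_nop(sqe);
        sqe->addr = 0x1111;     /* relayed back as extra1 */
        sqe->addr2 = 0x2222;    /* relayed back as extra2 */
        io_uring_submit(&ring);
        if (!io_uring_wait_cqe(&ring, &cqe)) {
                printf("extra1=%llx extra2=%llx\n",
                       (unsigned long long) cqe->big_cqe[0],
                       (unsigned long long) cqe->big_cqe[1]);
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
}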
static int io_msg_ring_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
- sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
+ sqe->buf_index || sqe->personality))
return -EINVAL;
req->msg.user_data = READ_ONCE(sqe->off);
@@ -4480,17 +5316,15 @@ done:
if (ret < 0)
req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
+ /* put file to avoid an attempt to IOPOLL the req */
+ io_put_file(req->file);
+ req->file = NULL;
return 0;
}
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
+ if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@ -4514,8 +5348,6 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4523,10 +5355,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -4544,9 +5373,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
- if (ret < 0)
- req_set_fail(req);
- else
+ if (ret >= 0)
fsnotify_modify(req->file);
io_req_complete(req, ret);
return 0;
@@ -4557,9 +5384,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4614,6 +5439,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_openat_prep(req, sqe);
}
+static int io_file_bitmap_get(struct io_ring_ctx *ctx)
+{
+ struct io_file_table *table = &ctx->file_table;
+ unsigned long nr = ctx->nr_user_files;
+ int ret;
+
+ if (table->alloc_hint >= nr)
+ table->alloc_hint = 0;
+
+ do {
+ ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
+ if (ret != nr) {
+ table->alloc_hint = ret + 1;
+ return ret;
+ }
+ if (!table->alloc_hint)
+ break;
+
+ nr = table->alloc_hint;
+ table->alloc_hint = 0;
+ } while (1);
+
+ return -ENFILE;
+}
+
+static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
+ struct file *file, unsigned int file_slot)
+{
+ bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ if (alloc_slot) {
+ io_ring_submit_lock(ctx, issue_flags);
+ ret = io_file_bitmap_get(ctx);
+ if (unlikely(ret < 0)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+ }
+
+ file_slot = ret;
+ } else {
+ file_slot--;
+ }
+
+ ret = io_install_fixed_file(req, file, issue_flags, file_slot);
+ if (alloc_slot) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ if (!ret)
+ return file_slot;
+ }
+
+ return ret;
+}
+
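The consumer side of the allocated-slot path, as a sketch assuming liburing >= 2.2 for io_uring_register_files_sparse(); passing IORING_FILE_INDEX_ALLOC in sqe->file_index makes the bitmap scan above pick a free slot, which is returned as cqe->res:

#include <liburing.h>
#include <fcntl.h>

static int open_into_free_slot(struct io_uring *ring, const char *path)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int slot = -1;

        if (io_uring_register_files_sparse(ring, 16))  /* 16 empty fixed slots */
                return -1;
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_openat(sqe, AT_FDCWD, path, O_RDONLY, 0);
        sqe->file_index = IORING_FILE_INDEX_ALLOC;     /* kernel picks the slot */
        io_uring_submit(ring);
        if (!io_uring_wait_cqe(ring, &cqe)) {
                slot = cqe->res;       /* >= 0: the allocated fixed-file slot */
                io_uring_cqe_seen(ring, cqe);
        }
        return slot;
}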
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
@@ -4669,8 +5549,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
fd_install(ret, file);
else
- ret = io_install_fixed_file(req, file, issue_flags,
- req->open.file_slot - 1);
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ req->open.file_slot);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -4691,7 +5571,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+ if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
sqe->splice_fd_in)
return -EINVAL;
@@ -4714,6 +5594,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs)
return 0;
+ if (bl->buf_nr_pages) {
+ int j;
+
+ i = bl->buf_ring->tail - bl->head;
+ for (j = 0; j < bl->buf_nr_pages; j++)
+ unpin_user_page(bl->buf_pages[j]);
+ kvfree(bl->buf_pages);
+ bl->buf_pages = NULL;
+ bl->buf_nr_pages = 0;
+ /* make sure it's seen as empty */
+ INIT_LIST_HEAD(&bl->buf_list);
+ return i;
+ }
+
/* the head kbuf is the list itself */
while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
@@ -4735,22 +5629,23 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
-
- io_ring_submit_lock(ctx, needs_lock);
- lockdep_assert_held(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
bl = io_buffer_get_list(ctx, p->bgid);
- if (bl)
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ if (bl) {
+ ret = -EINVAL;
+ /* can't use provide/remove buffers command on mapped buffers */
+ if (!bl->buf_nr_pages)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ }
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return 0;
}
@@ -4761,7 +5656,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -4858,26 +5753,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
+static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
+{
+ int i;
+
+ ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
+ GFP_KERNEL);
+ if (!ctx->io_bl)
+ return -ENOMEM;
+
+ for (i = 0; i < BGID_ARRAY; i++) {
+ INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
+ ctx->io_bl[i].bgid = i;
+ }
+
+ return 0;
+}
+
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
- lockdep_assert_held(&ctx->uring_lock);
+ if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
+ ret = io_init_bl_list(ctx);
+ if (ret)
+ goto err;
+ }
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
- bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl) {
ret = -ENOMEM;
goto err;
}
- io_buffer_add_list(ctx, bl, p->bgid);
+ INIT_LIST_HEAD(&bl->buf_list);
+ ret = io_buffer_add_list(ctx, bl, p->bgid);
+ if (ret) {
+ kfree(bl);
+ goto err;
+ }
+ }
+ /* can't add buffers via this command for a mapped buffer ring */
+ if (bl->buf_nr_pages) {
+ ret = -EINVAL;
+ goto err;
}
ret = io_add_buffers(ctx, p, bl);
@@ -4886,7 +5811,7 @@ err:
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return 0;
}
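For contrast with the new ring flavor, the classic flow this function services, as a sketch assuming liburing: hand nr buffers of len bytes each to group bgid, then let a linked recv consume one via IOSQE_BUFFER_SELECT:

#include <liburing.h>
#include <stdlib.h>

#define NR_BUFS 8
#define BUF_LEN 4096

static int provide_group(struct io_uring *ring, int sockfd, unsigned short bgid)
{
        char *mem = malloc((size_t) NR_BUFS * BUF_LEN); /* stays owned by the kernel */
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!mem || !sqe)
                return -1;
        /* buffers get IDs bid..bid+nr-1; here 0..7 */
        io_uring_prep_provide_buffers(sqe, mem, BUF_LEN, NR_BUFS, bgid, 0);
        sqe->flags |= IOSQE_IO_LINK;    /* buffers in place before the recv runs */

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_recv(sqe, sockfd, NULL, BUF_LEN, 0);
        sqe->flags |= IOSQE_BUFFER_SELECT;      /* kernel picks a buffer */
        sqe->buf_group = bgid;
        return io_uring_submit(ring);
}

The completion sets IORING_CQE_F_BUFFER and carries the chosen bid in cqe->flags >> IORING_CQE_BUFFER_SHIFT.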
@@ -4894,9 +5819,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -4940,9 +5863,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
@@ -4964,8 +5885,6 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -4975,9 +5894,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
@@ -5013,9 +5930,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *path;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -5051,19 +5966,13 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
ctx->buffer);
-
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- sqe->rw_flags || sqe->buf_index)
+ if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -5095,7 +6004,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
spin_unlock(&files->file_lock);
goto err;
}
- file = fdt->fd[close->fd];
+ file = rcu_dereference_protected(fdt->fd[close->fd],
+ lockdep_is_held(&files->file_lock));
if (!file || file->f_op == &io_uring_fops) {
spin_unlock(&files->file_lock);
file = NULL;
@@ -5129,12 +6039,7 @@ err:
static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
+ if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -5153,13 +6058,18 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
#if defined(CONFIG_NET)
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
@@ -5205,11 +6115,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(sqe->file_index))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+ sr->flags = READ_ONCE(sqe->addr2);
+ if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -5218,12 +6133,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -5242,7 +6159,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg = &iomsg;
}
- flags = req->sr_msg.msg_flags;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg);
+
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5255,12 +6176,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg);
+ }
req_set_fail(req);
}
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5275,6 +6205,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
int min_ret = 0;
int ret;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -5288,7 +6222,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_controllen = 0;
msg.msg_namelen = 0;
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5301,8 +6235,19 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
req_set_fail(req);
}
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5394,14 +6339,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
return __io_recvmsg_copy_hdr(req, iomsg);
}
-static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
-}
-
static int io_recvmsg_prep_async(struct io_kiocb *req)
{
int ret;
@@ -5416,12 +6353,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(sqe->file_index))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
- sr->bgid = READ_ONCE(sqe->buf_group);
+ sr->flags = READ_ONCE(sqe->addr2);
+ if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -5434,19 +6375,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static bool io_net_retry(struct socket *sock, int flags)
-{
- if (!(flags & MSG_WAITALL))
- return false;
- return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
-}
-
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
- struct io_buffer *kbuf;
+ unsigned int cflags;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@@ -5464,24 +6398,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg = &iomsg;
}
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- kmsg->fast_iov[0].iov_len = req->sr_msg.len;
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
- 1, req->sr_msg.len);
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg);
+
+ if (io_do_buffer_select(req)) {
+ void __user *buf;
+
+ buf = io_buffer_select(req, &sr->len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ kmsg->fast_iov[0].iov_base = buf;
+ kmsg->fast_iov[0].iov_len = sr->len;
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
+ sr->len);
}
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
- ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
- kmsg->uaddr, flags);
+ kmsg->msg.msg_get_inq = 1;
+ ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock)
return io_setup_async_msg(req, kmsg);
@@ -5505,45 +6445,54 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
+ cflags = io_put_kbuf(req, issue_flags);
+ if (kmsg->msg.msg_inq)
+ cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ __io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_buffer *kbuf;
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
- void __user *buf = sr->buf;
struct socket *sock;
struct iovec iov;
+ unsigned int cflags;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- buf = u64_to_user_ptr(kbuf->addr);
+ if (io_do_buffer_select(req)) {
+ void __user *buf;
+
+ buf = io_buffer_select(req, &sr->len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ sr->buf = buf;
}
- ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
+ ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
goto out_free;
msg.msg_name = NULL;
+ msg.msg_namelen = 0;
msg.msg_control = NULL;
+ msg.msg_get_inq = 1;
+ msg.msg_flags = 0;
msg.msg_controllen = 0;
- msg.msg_namelen = 0;
msg.msg_iocb = NULL;
- msg.msg_flags = 0;
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5572,36 +6521,49 @@ out_free:
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
+ cflags = io_put_kbuf(req, issue_flags);
+ if (msg.msg_inq)
+ cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ __io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
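A consumer sketch for the two additions visible above. Note that in this revision the prep code reads the send/recv flags out of sqe->addr2, so the example sets that field directly rather than relying on a liburing helper:

#include <liburing.h>

static int queue_recv(struct io_uring *ring, int sockfd, void *buf, size_t len)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_recv(sqe, sockfd, buf, len, 0);
        /* skip the eager attempt; arm poll first (becomes sr->flags above) */
        sqe->addr2 = IORING_RECVSEND_POLL_FIRST;
        return io_uring_submit(ring);
}

/* after completion: is more data already queued on the socket? */
static int sock_nonempty(const struct io_uring_cqe *cqe)
{
        return (cqe->flags & IORING_CQE_F_SOCK_NONEMPTY) != 0;
}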
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = &req->accept;
+ unsigned flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index)
+ if (sqe->len || sqe->buf_index)
return -EINVAL;
accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
+ flags = READ_ONCE(sqe->ioprio);
+ if (flags & ~IORING_ACCEPT_MULTISHOT)
+ return -EINVAL;
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
- return -EINVAL;
+ if (accept->file_slot) {
+ if (accept->flags & SOCK_CLOEXEC)
+ return -EINVAL;
+ if (flags & IORING_ACCEPT_MULTISHOT &&
+ accept->file_slot != IORING_FILE_INDEX_ALLOC)
+ return -EINVAL;
+ }
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+ if (flags & IORING_ACCEPT_MULTISHOT)
+ req->flags |= REQ_F_APOLL_MULTISHOT;
return 0;
}
static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_accept *accept = &req->accept;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -5609,6 +6571,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
+retry:
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -5620,7 +6583,89 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
put_unused_fd(fd);
ret = PTR_ERR(file);
- if (ret == -EAGAIN && force_nonblock)
+ if (ret == -EAGAIN && force_nonblock) {
+ /*
+ * If it's multishot and polled, we don't need to
+ * return -EAGAIN to arm the poll infra since that
+ * has already been done
+ */
+ if ((req->flags & IO_APOLL_MULTI_POLLED) ==
+ IO_APOLL_MULTI_POLLED)
+ ret = 0;
+ return ret;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ req_set_fail(req);
+ } else if (!fixed) {
+ fd_install(fd, file);
+ ret = fd;
+ } else {
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ accept->file_slot);
+ }
+
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
+ }
+ if (ret >= 0) {
+ bool filled;
+
+ spin_lock(&ctx->completion_lock);
+ filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret,
+ IORING_CQE_F_MORE);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (filled) {
+ io_cqring_ev_posted(ctx);
+ goto retry;
+ }
+ ret = -ECANCELED;
+ }
+
+ return ret;
+}
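Userspace drives the multishot path with a single SQE, assuming liburing >= 2.2 for io_uring_prep_multishot_accept(): every connection posts a CQE flagged IORING_CQE_F_MORE, and a completion without that flag (the -ECANCELED path above) means the accept must be re-armed:

#include <liburing.h>
#include <unistd.h>

static void accept_loop(struct io_uring *ring, int listen_fd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;

        io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
        io_uring_submit(ring);

        while (!io_uring_wait_cqe(ring, &cqe)) {
                int conn_fd = cqe->res;
                int more = cqe->flags & IORING_CQE_F_MORE;

                io_uring_cqe_seen(ring, cqe);
                if (conn_fd >= 0)
                        close(conn_fd);        /* a real server would serve it */
                if (!more)
                        break;                 /* re-arm or tear down here */
        }
}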
+
+static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_socket *sock = &req->sock;
+
+ if (sqe->addr || sqe->rw_flags || sqe->buf_index)
+ return -EINVAL;
+
+ sock->domain = READ_ONCE(sqe->fd);
+ sock->type = READ_ONCE(sqe->off);
+ sock->protocol = READ_ONCE(sqe->len);
+ sock->file_slot = READ_ONCE(sqe->file_index);
+ sock->nofile = rlimit(RLIMIT_NOFILE);
+
+ sock->flags = sock->type & ~SOCK_TYPE_MASK;
+ if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
+ return -EINVAL;
+ if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+ return 0;
+}
+
+static int io_socket(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_socket *sock = &req->sock;
+ bool fixed = !!sock->file_slot;
+ struct file *file;
+ int ret, fd;
+
+ if (!fixed) {
+ fd = __get_unused_fd_flags(sock->flags, sock->nofile);
+ if (unlikely(fd < 0))
+ return fd;
+ }
+ file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
+ if (IS_ERR(file)) {
+ if (!fixed)
+ put_unused_fd(fd);
+ ret = PTR_ERR(file);
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -5630,7 +6675,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
ret = fd;
} else {
ret = io_install_fixed_file(req, file, issue_flags,
- accept->file_slot - 1);
+ sock->file_slot - 1);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
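The new opcode mirrors socket(2); per io_socket_prep() above, domain/type/protocol travel in sqe->fd/off/len. A sketch assuming liburing >= 2.2 for the prep helper:

#include <liburing.h>
#include <sys/socket.h>

static int make_tcp_socket(struct io_uring *ring)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int fd = -1;

        io_uring_prep_socket(sqe, AF_INET, SOCK_STREAM, 0, 0);
        io_uring_submit(ring);
        if (!io_uring_wait_cqe(ring, &cqe)) {
                fd = cqe->res;         /* a normal fd, or -errno on failure */
                io_uring_cqe_seen(ring, cqe);
        }
        return fd;
}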
@@ -5648,10 +6693,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
+ if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -5724,6 +6766,7 @@ IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
+IO_NETOP_PREP(socket);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */
@@ -5774,7 +6817,7 @@ static void io_poll_req_insert(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
struct hlist_head *list;
- list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+ list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
hlist_add_head(&req->hash_node, list);
}
@@ -5833,22 +6876,23 @@ static void io_poll_remove_entries(struct io_kiocb *req)
rcu_read_unlock();
}
+static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);
/*
* All poll tw should go through this. Checks for poll events, manages
* references, does rewait, etc.
*
* Returns a negative error on failure. >0 when no action is required, which is
* either a spurious wakeup or a served multishot CQE. 0 when it's done with
- * the request, then the mask is stored in req->result.
+ * the request, then the mask is stored in req->cqe.res.
*/
-static int io_poll_check_events(struct io_kiocb *req, bool locked)
+static int io_poll_check_events(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
- int v;
+ int v, ret;
/* req->task == current here, checking PF_EXITING is safe */
if (unlikely(req->task->flags & PF_EXITING))
- io_poll_mark_cancelled(req);
+ return -ECANCELED;
do {
v = atomic_read(&req->poll_refs);
@@ -5859,32 +6903,46 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked)
if (v & IO_POLL_CANCEL_FLAG)
return -ECANCELED;
- if (!req->result) {
+ if (!req->cqe.res) {
struct poll_table_struct pt = { ._key = req->apoll_events };
unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
if (unlikely(!io_assign_file(req, flags)))
return -EBADF;
- req->result = vfs_poll(req->file, &pt) & req->apoll_events;
+ req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
}
- /* multishot, just fill an CQE and proceed */
- if (req->result && !(req->apoll_events & EPOLLONESHOT)) {
- __poll_t mask = mangle_poll(req->result & req->apoll_events);
+ if (unlikely(!req->cqe.res))
+ continue;
+ if (req->apoll_events & EPOLLONESHOT)
+ return 0;
+
+ /* multishot, just fill a CQE and proceed */
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+ __poll_t mask = mangle_poll(req->cqe.res &
+ req->apoll_events);
bool filled;
spin_lock(&ctx->completion_lock);
- filled = io_fill_cqe_aux(ctx, req->user_data, mask,
- IORING_CQE_F_MORE);
+ filled = io_fill_cqe_aux(ctx, req->cqe.user_data,
+ mask, IORING_CQE_F_MORE);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
- if (unlikely(!filled))
- return -ECANCELED;
- io_cqring_ev_posted(ctx);
- } else if (req->result) {
- return 0;
+ if (filled) {
+ io_cqring_ev_posted(ctx);
+ continue;
+ }
+ return -ECANCELED;
}
+ io_tw_lock(req->ctx, locked);
+ if (unlikely(req->task->flags & PF_EXITING))
+ return -EFAULT;
+ ret = io_issue_sqe(req,
+ IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (ret)
+ return ret;
+
/*
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
@@ -5899,21 +6957,21 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req, *locked);
+ ret = io_poll_check_events(req, locked);
if (ret > 0)
return;
if (!ret) {
- req->result = mangle_poll(req->result & req->poll.events);
+ req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
} else {
- req->result = ret;
+ req->cqe.res = ret;
req_set_fail(req);
}
io_poll_remove_entries(req);
spin_lock(&ctx->completion_lock);
hash_del(&req->hash_node);
- __io_req_complete_post(req, req->result, 0);
+ __io_req_complete_post(req, req->cqe.res, 0);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -5924,7 +6982,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req, *locked);
+ ret = io_poll_check_events(req, locked);
if (ret > 0)
return;
@@ -5939,9 +6997,9 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
+static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
{
- req->result = mask;
+ req->cqe.res = mask;
/*
* This is useful for poll that is armed on behalf of another
* request, and where the wakeup path could be on a different
@@ -5954,11 +7012,12 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
else
req->io_task_work.func = io_apoll_task_func;
- trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
- io_req_task_work_add(req, false);
+ trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
+ io_req_task_work_add(req);
}
-static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
+static inline void io_poll_execute(struct io_kiocb *req, int res,
+ __poll_t events)
{
if (io_poll_get_ownership(req))
__io_poll_execute(req, res, events);
@@ -5973,6 +7032,7 @@ static void io_poll_cancel_req(struct io_kiocb *req)
#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
+#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI)
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
@@ -6007,7 +7067,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
}
/* for instances that support it check for an event match first */
- if (mask && !(mask & poll->events))
+ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
return 0;
if (io_poll_get_ownership(req)) {
@@ -6093,6 +7153,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
int v;
INIT_HLIST_NODE(&req->hash_node);
+ req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
@@ -6163,28 +7224,34 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
+ __poll_t mask = POLLPRI | POLLERR;
int ret;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
- if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ if (!file_can_poll(req->file))
+ return IO_APOLL_ABORTED;
+ if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
return IO_APOLL_ABORTED;
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT))
+ mask |= EPOLLONESHOT;
if (def->pollin) {
- mask |= POLLIN | POLLRDNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
if ((req->opcode == IORING_OP_RECVMSG) &&
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
- mask &= ~POLLIN;
+ mask &= ~EPOLLIN;
} else {
- mask |= POLLOUT | POLLWRNORM;
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
if (def->poll_exclusive)
mask |= EPOLLEXCLUSIVE;
- if (!(issue_flags & IO_URING_F_UNLOCKED) &&
- !list_empty(&ctx->apoll_cache)) {
+ if (req->flags & REQ_F_POLLED) {
+ apoll = req->apoll;
+ } else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ !list_empty(&ctx->apoll_cache)) {
apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
poll.wait.entry);
list_del_init(&apoll->poll.wait.entry);
@@ -6204,7 +7271,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
if (ret || ipt.error)
return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
- trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
+ trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
mask, apoll->poll.events);
return IO_APOLL_OK;
}
@@ -6237,24 +7304,53 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
return found;
}
-static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
- bool poll_only)
+static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
+ struct io_cancel_data *cd)
__must_hold(&ctx->completion_lock)
{
struct hlist_head *list;
struct io_kiocb *req;
- list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
+ list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)];
hlist_for_each_entry(req, list, hash_node) {
- if (sqe_addr != req->user_data)
+ if (cd->data != req->cqe.user_data)
continue;
if (poll_only && req->opcode != IORING_OP_POLL_ADD)
continue;
+ if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
+ if (cd->seq == req->work.cancel_seq)
+ continue;
+ req->work.cancel_seq = cd->seq;
+ }
return req;
}
return NULL;
}
+static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
+ struct io_cancel_data *cd)
+ __must_hold(&ctx->completion_lock)
+{
+ struct io_kiocb *req;
+ int i;
+
+ for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[i];
+ hlist_for_each_entry(req, list, hash_node) {
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
+ req->file != cd->file)
+ continue;
+ if (cd->seq == req->work.cancel_seq)
+ continue;
+ req->work.cancel_seq = cd->seq;
+ return req;
+ }
+ }
+ return NULL;
+}
+
static bool io_poll_disarm(struct io_kiocb *req)
__must_hold(&ctx->completion_lock)
{
@@ -6265,12 +7361,15 @@ static bool io_poll_disarm(struct io_kiocb *req)
return true;
}
-static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
- bool poll_only)
+static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
__must_hold(&ctx->completion_lock)
{
- struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
+ struct io_kiocb *req;
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
+ req = io_poll_file_find(ctx, cd);
+ else
+ req = io_poll_find(ctx, false, cd);
if (!req)
return -ENOENT;
io_poll_cancel_req(req);
@@ -6297,9 +7396,7 @@ static int io_poll_update_prep(struct io_kiocb *req,
struct io_poll_update *upd = &req->poll_update;
u32 flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
@@ -6329,9 +7426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
struct io_poll_iocb *poll = &req->poll;
u32 flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
+ if (sqe->buf_index || sqe->off || sqe->addr)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~IORING_POLL_ADD_MULTI)
@@ -6361,13 +7456,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
+ struct io_cancel_data cd = { .data = req->poll_update.old_user_data, };
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *preq;
int ret2, ret = 0;
bool locked;
spin_lock(&ctx->completion_lock);
- preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
+ preq = io_poll_find(ctx, true, &cd);
if (!preq || !io_poll_disarm(preq)) {
spin_unlock(&ctx->completion_lock);
ret = preq ? -EALREADY : -ENOENT;
@@ -6383,7 +7479,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
preq->poll.events |= IO_POLL_UNMASK;
}
if (req->poll_update.update_user_data)
- preq->user_data = req->poll_update.new_user_data;
+ preq->cqe.user_data = req->poll_update.new_user_data;
ret2 = io_poll_add(preq, issue_flags);
/* successfully updated, don't complete poll request */
@@ -6392,7 +7488,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
}
req_set_fail(preq);
- preq->result = -ECANCELED;
+ preq->cqe.res = -ECANCELED;
locked = !(issue_flags & IO_URING_F_UNLOCKED);
io_req_task_complete(preq, &locked);
out:
@@ -6420,14 +7516,14 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
req_set_fail(req);
- req->result = -ETIME;
+ req->cqe.res = -ETIME;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
- __u64 user_data)
+ struct io_cancel_data *cd)
__must_hold(&ctx->timeout_lock)
{
struct io_timeout_data *io;
@@ -6435,9 +7531,16 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
bool found = false;
list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- found = user_data == req->user_data;
- if (found)
- break;
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
+ cd->data != req->cqe.user_data)
+ continue;
+ if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
+ if (cd->seq == req->work.cancel_seq)
+ continue;
+ req->work.cancel_seq = cd->seq;
+ }
+ found = true;
+ break;
}
if (!found)
return ERR_PTR(-ENOENT);
@@ -6449,11 +7552,14 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
return req;
}
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
__must_hold(&ctx->completion_lock)
- __must_hold(&ctx->timeout_lock)
{
- struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_kiocb *req;
+
+ spin_lock_irq(&ctx->timeout_lock);
+ req = io_timeout_extract(ctx, cd);
+ spin_unlock_irq(&ctx->timeout_lock);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -6486,7 +7592,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
bool found = false;
list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
- found = user_data == req->user_data;
+ found = user_data == req->cqe.user_data;
if (found)
break;
}
@@ -6506,7 +7612,8 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
- struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_cancel_data cd = { .data = user_data, };
+ struct io_kiocb *req = io_timeout_extract(ctx, &cd);
struct io_timeout_data *data;
if (IS_ERR(req))
@@ -6526,11 +7633,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
struct io_timeout_rem *tr = &req->timeout_rem;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
tr->ltimeout = false;
@@ -6571,10 +7676,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
int ret;
if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
+ struct io_cancel_data cd = { .data = tr->addr, };
+
spin_lock(&ctx->completion_lock);
- spin_lock_irq(&ctx->timeout_lock);
- ret = io_timeout_cancel(ctx, tr->addr);
- spin_unlock_irq(&ctx->timeout_lock);
+ ret = io_timeout_cancel(ctx, &cd);
spin_unlock(&ctx->completion_lock);
} else {
enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
@@ -6600,10 +7705,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
unsigned flags;
u32 off = READ_ONCE(sqe->off);
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
- sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
return -EINVAL;
@@ -6702,30 +7804,42 @@ add:
return 0;
}
-struct io_cancel_data {
- struct io_ring_ctx *ctx;
- u64 user_data;
-};
-
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_cancel_data *cd = data;
- return req->ctx == cd->ctx && req->user_data == cd->user_data;
+ if (req->ctx != cd->ctx)
+ return false;
+ if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
+ ;
+ } else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
+ if (req->file != cd->file)
+ return false;
+ } else {
+ if (req->cqe.user_data != cd->data)
+ return false;
+ }
+ if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
+ if (cd->seq == req->work.cancel_seq)
+ return false;
+ req->work.cancel_seq = cd->seq;
+ }
+ return true;
}
-static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
- struct io_ring_ctx *ctx)
+static int io_async_cancel_one(struct io_uring_task *tctx,
+ struct io_cancel_data *cd)
{
- struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
enum io_wq_cancel cancel_ret;
int ret = 0;
+ bool all;
if (!tctx || !tctx->io_wq)
return -ENOENT;
- cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
+ all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
+ cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -6741,14 +7855,14 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
return ret;
}
-static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
+static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
- ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
+ ret = io_async_cancel_one(req->task->io_uring, cd);
/*
* Fall-through even for -EALREADY, as we may have poll armed
* that need unarming.
@@ -6757,56 +7871,98 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
return 0;
spin_lock(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, sqe_addr, false);
+ ret = io_poll_cancel(ctx, cd);
if (ret != -ENOENT)
goto out;
-
- spin_lock_irq(&ctx->timeout_lock);
- ret = io_timeout_cancel(ctx, sqe_addr);
- spin_unlock_irq(&ctx->timeout_lock);
+ if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
+ ret = io_timeout_cancel(ctx, cd);
out:
spin_unlock(&ctx->completion_lock);
return ret;
}
+#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
+ IORING_ASYNC_CANCEL_ANY)
+
static int io_async_cancel_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->len || sqe->splice_fd_in)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
+ req->cancel.flags = READ_ONCE(sqe->cancel_flags);
+ if (req->cancel.flags & ~CANCEL_FLAGS)
+ return -EINVAL;
+ if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) {
+ if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY)
+ return -EINVAL;
+ req->cancel.fd = READ_ONCE(sqe->fd);
+ }
+
return 0;
}
-static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req,
+ unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
- u64 sqe_addr = req->cancel.addr;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
+ struct io_ring_ctx *ctx = cd->ctx;
struct io_tctx_node *node;
- int ret;
+ int ret, nr = 0;
- ret = io_try_cancel_userdata(req, sqe_addr);
- if (ret != -ENOENT)
- goto done;
+ do {
+ ret = io_try_cancel(req, cd);
+ if (ret == -ENOENT)
+ break;
+ if (!all)
+ return ret;
+ nr++;
+ } while (1);
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
- ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
- if (ret != -ENOENT)
- break;
+ ret = io_async_cancel_one(tctx, cd);
+ if (ret != -ENOENT) {
+ if (!all)
+ break;
+ nr++;
+ }
}
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
+ return all ? nr : ret;
+}
+
+static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_cancel_data cd = {
+ .ctx = req->ctx,
+ .data = req->cancel.addr,
+ .flags = req->cancel.flags,
+ .seq = atomic_inc_return(&req->ctx->cancel_seq),
+ };
+ int ret;
+
+ if (cd.flags & IORING_ASYNC_CANCEL_FD) {
+ if (req->flags & REQ_F_FIXED_FILE)
+ req->file = io_file_get_fixed(req, req->cancel.fd,
+ issue_flags);
+ else
+ req->file = io_file_get_normal(req, req->cancel.fd);
+ if (!req->file) {
+ ret = -EBADF;
+ goto done;
+ }
+ cd.file = req->file;
+ }
+
+ ret = __io_async_cancel(&cd, req, issue_flags);
done:
if (ret < 0)
req_set_fail(req);
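/*
 * Hedged userspace sketch of the new cancel modes wired up above; raw
 * SQE filling is shown so no liburing version is assumed, and ring setup
 * plus io_uring_enter() plumbing are elided. IORING_ASYNC_CANCEL_FD
 * matches by file instead of user_data, and IORING_ASYNC_CANCEL_ALL
 * cancels every match rather than just the first; the CQE res then
 * carries the number of cancelled requests.
 */
#include <linux/io_uring.h>
#include <string.h>

static void prep_cancel_all_by_fd(struct io_uring_sqe *sqe, int fd)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->fd = fd;		/* the file to match requests against */
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_ALL;
	sqe->user_data = 0xdeadbeef;	/* tag for our own completion */
}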
@@ -6819,7 +7975,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
{
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->rsrc_update.offset = READ_ONCE(sqe->off);
@@ -6833,7 +7989,6 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
@@ -6844,10 +7999,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.resv = 0;
up.resv2 = 0;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0)
req_set_fail(req);
@@ -6859,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
switch (req->opcode) {
case IORING_OP_NOP:
- return 0;
+ return io_nop_prep(req, sqe);
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
@@ -6933,6 +8088,18 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_linkat_prep(req, sqe);
case IORING_OP_MSG_RING:
return io_msg_ring_prep(req, sqe);
+ case IORING_OP_FSETXATTR:
+ return io_fsetxattr_prep(req, sqe);
+ case IORING_OP_SETXATTR:
+ return io_setxattr_prep(req, sqe);
+ case IORING_OP_FGETXATTR:
+ return io_fgetxattr_prep(req, sqe);
+ case IORING_OP_GETXATTR:
+ return io_getxattr_prep(req, sqe);
+ case IORING_OP_SOCKET:
+ return io_socket_prep(req, sqe);
+ case IORING_OP_URING_CMD:
+ return io_uring_cmd_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6942,7 +8109,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_req_prep_async(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].needs_async_setup)
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+
+ /* assign the file early for deferred execution of non-fixed files */
+ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+ req->file = io_file_get_normal(req, req->cqe.fd);
+ if (!def->needs_async_setup)
return 0;
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
@@ -6960,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req)
return io_recvmsg_prep_async(req);
case IORING_OP_CONNECT:
return io_connect_prep_async(req);
+ case IORING_OP_URING_CMD:
+ return io_uring_cmd_prep_async(req);
}
printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
req->opcode);
@@ -6969,9 +8143,10 @@ static int io_req_prep_async(struct io_kiocb *req)
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
+ struct io_kiocb *cur;
/* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(req, req)
+ io_for_each_link(cur, req)
seq--;
return seq;
}
@@ -7014,7 +8189,7 @@ fail:
goto queue;
}
- trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
+ trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
@@ -7076,6 +8251,12 @@ static void io_clean_op(struct io_kiocb *req)
if (req->statx.filename)
putname(req->statx.filename);
break;
+ case IORING_OP_SETXATTR:
+ case IORING_OP_FSETXATTR:
+ case IORING_OP_GETXATTR:
+ case IORING_OP_FGETXATTR:
+ __io_xattr_finish(req);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -7098,15 +8279,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
return true;
if (req->flags & REQ_F_FIXED_FILE)
- req->file = io_file_get_fixed(req, req->fd, issue_flags);
+ req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
else
- req->file = io_file_get_normal(req, req->fd);
- if (req->file)
- return true;
+ req->file = io_file_get_normal(req, req->cqe.fd);
- req_set_fail(req);
- req->result = -EBADF;
- return false;
+ return !!req->file;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
@@ -7236,6 +8413,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_MSG_RING:
ret = io_msg_ring(req, issue_flags);
break;
+ case IORING_OP_FSETXATTR:
+ ret = io_fsetxattr(req, issue_flags);
+ break;
+ case IORING_OP_SETXATTR:
+ ret = io_setxattr(req, issue_flags);
+ break;
+ case IORING_OP_FGETXATTR:
+ ret = io_fgetxattr(req, issue_flags);
+ break;
+ case IORING_OP_GETXATTR:
+ ret = io_getxattr(req, issue_flags);
+ break;
+ case IORING_OP_SOCKET:
+ ret = io_socket(req, issue_flags);
+ break;
+ case IORING_OP_URING_CMD:
+ ret = io_uring_cmd(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
@@ -7269,7 +8464,6 @@ static void io_wq_submit_work(struct io_wq_work *work)
const struct io_op_def *def = &io_op_defs[req->opcode];
unsigned int issue_flags = IO_URING_F_UNLOCKED;
bool needs_poll = false;
- struct io_kiocb *timeout;
int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_free_work() after returning to io-wq */
@@ -7278,10 +8472,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
else
req_ref_get(req);
- timeout = io_prep_linked_timeout(req);
- if (timeout)
- io_queue_linked_timeout(timeout);
-
+ io_arm_ltimeout(req);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) {
@@ -7314,6 +8505,8 @@ fail:
* wait for request slots on the block side.
*/
if (!needs_poll) {
+ if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
+ break;
cond_resched();
continue;
}
@@ -7359,8 +8552,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
struct file *file = NULL;
unsigned long file_ptr;
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_lock(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, issue_flags);
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
goto out;
@@ -7371,9 +8563,9 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
io_req_set_rsrc_node(req, ctx, 0);
+ WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
out:
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_unlock(&ctx->uring_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return file;
}
@@ -7394,7 +8586,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
struct file *file = fget(fd);
- trace_io_uring_file_get(req->ctx, req, req->user_data, fd);
+ trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
/* we don't allow fixed io_uring files */
if (file && file->f_op == &io_uring_fops)
@@ -7408,8 +8600,14 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
int ret = -ENOENT;
if (prev) {
- if (!(req->task->flags & PF_EXITING))
- ret = io_try_cancel_userdata(req, prev->user_data);
+ if (!(req->task->flags & PF_EXITING)) {
+ struct io_cancel_data cd = {
+ .ctx = req->ctx,
+ .data = prev->cqe.user_data,
+ };
+
+ ret = io_try_cancel(req, &cd);
+ }
io_req_complete_post(req, ret ?: -ETIME, 0);
io_put_req(prev);
} else {
@@ -7443,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
@@ -7469,10 +8667,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+static void io_queue_async(struct io_kiocb *req, int ret)
__must_hold(&req->ctx->uring_lock)
{
- struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *linked_timeout;
+
+ if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
+ io_req_complete_failed(req, ret);
+ return;
+ }
+
+ linked_timeout = io_prep_linked_timeout(req);
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
@@ -7483,7 +8688,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
- io_queue_async_work(req, NULL);
+ io_queue_iowq(req, NULL);
break;
case IO_APOLL_OK:
break;
@@ -7493,10 +8698,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
io_queue_linked_timeout(linked_timeout);
}
-static inline void __io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- struct io_kiocb *linked_timeout;
int ret;
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
@@ -7509,22 +8713,23 @@ static inline void __io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret)) {
- linked_timeout = io_prep_linked_timeout(req);
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- io_queue_sqe_arm_apoll(req);
- } else {
- io_req_complete_failed(req, ret);
- }
+ if (likely(!ret))
+ io_arm_ltimeout(req);
+ else
+ io_queue_async(req, ret);
}
static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
+ if (unlikely(req->flags & REQ_F_FAIL)) {
+ /*
+ * We don't submit; fail the whole chain. For that, downgrade any
+ * hardlinks to normal links; an extra REQ_F_LINK is tolerated.
+ */
+ req->flags &= ~REQ_F_HARDLINK;
+ req->flags |= REQ_F_LINK;
+ io_req_complete_failed(req, req->cqe.res);
} else if (unlikely(req->ctx->drain_active)) {
io_drain_req(req);
} else {
@@ -7533,19 +8738,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
if (unlikely(ret))
io_req_complete_failed(req, ret);
else
- io_queue_async_work(req, NULL);
+ io_queue_iowq(req, NULL);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
- __io_queue_sqe(req);
- else
- io_queue_sqe_fallback(req);
-}
-
/*
* Check SQE restrictions (opcode and flags).
*
@@ -7600,9 +8796,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
- req->user_data = READ_ONCE(sqe->user_data);
+ req->cqe.user_data = READ_ONCE(sqe->user_data);
req->file = NULL;
- req->fixed_rsrc_refs = NULL;
+ req->rsrc_node = NULL;
req->task = current;
if (unlikely(opcode >= IORING_OP_LAST)) {
@@ -7613,9 +8809,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
return -EINVAL;
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[opcode].buffer_select)
- return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_BUFFER_SELECT) {
+ if (!io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ req->buf_index = READ_ONCE(sqe->buf_group);
+ }
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
if (sqe_flags & IOSQE_IO_DRAIN) {
@@ -7638,10 +8836,15 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
}
+ if (!io_op_defs[opcode].ioprio && sqe->ioprio)
+ return -EINVAL;
+ if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
if (io_op_defs[opcode].needs_file) {
struct io_submit_state *state = &ctx->submit_state;
- req->fd = READ_ONCE(sqe->fd);
+ req->cqe.fd = READ_ONCE(sqe->fd);
/*
* Plug now if we have more than 2 IO left after this, and the
@@ -7673,7 +8876,44 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
return io_req_prep(req, sqe);
}
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
+ struct io_kiocb *req, int ret)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_link *link = &ctx->submit_state.link;
+ struct io_kiocb *head = link->head;
+
+ trace_io_uring_req_failed(sqe, ctx, req, ret);
+
+ /*
+ * Avoid breaking links in the middle as it renders links with SQPOLL
+ * unusable. Instead of failing eagerly, continue assembling the link if
+ * applicable and mark the head with REQ_F_FAIL. The link flushing code
+ * should find the flag and handle the rest.
+ */
+ req_fail_link_node(req, ret);
+ if (head && !(head->flags & REQ_F_FAIL))
+ req_fail_link_node(head, -ECANCELED);
+
+ if (!(req->flags & IO_REQ_LINK_FLAGS)) {
+ if (head) {
+ link->last->link = req;
+ link->head = NULL;
+ req = head;
+ }
+ io_queue_sqe_fallback(req);
+ return ret;
+ }
+
+ if (head)
+ link->last->link = req;
+ else
+ link->head = req;
+ link->last = req;
+ return 0;
+}
+
+static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
@@ -7681,35 +8921,11 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
int ret;
ret = io_init_req(ctx, req, sqe);
- if (unlikely(ret)) {
- trace_io_uring_req_failed(sqe, ctx, req, ret);
-
- /* fail even hard links since we don't submit */
- if (link->head) {
- /*
- * we can judge a link req is failed or cancelled by if
- * REQ_F_FAIL is set, but the head is an exception since
- * it may be set REQ_F_FAIL because of other req's failure
- * so let's leverage req->result to distinguish if a head
- * is set REQ_F_FAIL because of its failure or other req's
- * failure so that we can set the correct ret code for it.
- * init result here to avoid affecting the normal path.
- */
- if (!(link->head->flags & REQ_F_FAIL))
- req_fail_link_node(link->head, -ECANCELED);
- } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- /*
- * the current req is a normal req, we should return
- * error and thus break the submittion loop.
- */
- io_req_complete_failed(req, ret);
- return ret;
- }
- req_fail_link_node(req, ret);
- }
+ if (unlikely(ret))
+ return io_submit_fail_init(sqe, req, ret);
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
+ trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
req->flags, true,
ctx->flags & IORING_SETUP_SQPOLL);
@@ -7720,29 +8936,32 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
- if (link->head) {
- struct io_kiocb *head = link->head;
-
- if (!(req->flags & REQ_F_FAIL)) {
- ret = io_req_prep_async(req);
- if (unlikely(ret)) {
- req_fail_link_node(req, ret);
- if (!(head->flags & REQ_F_FAIL))
- req_fail_link_node(head, -ECANCELED);
- }
- }
- trace_io_uring_link(ctx, req, head);
+ if (unlikely(link->head)) {
+ ret = io_req_prep_async(req);
+ if (unlikely(ret))
+ return io_submit_fail_init(sqe, req, ret);
+
+ trace_io_uring_link(ctx, req, link->head);
link->last->link = req;
link->last = req;
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ if (req->flags & IO_REQ_LINK_FLAGS)
return 0;
- /* last request of a link, enqueue the link */
+ /* last request of the link, flush it */
+ req = link->head;
link->head = NULL;
- req = head;
- } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
+ if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
+ goto fallback;
+
+ } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
+ REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
+ if (req->flags & IO_REQ_LINK_FLAGS) {
+ link->head = req;
+ link->last = req;
+ } else {
+fallback:
+ io_queue_sqe_fallback(req);
+ }
return 0;
}
@@ -7757,8 +8976,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- if (state->link.head)
- io_queue_sqe(state->link.head);
+ if (unlikely(state->link.head))
+ io_queue_sqe_fallback(state->link.head);
/* flush only after queuing links as they can generate completions */
io_submit_flush_completions(ctx);
if (state->plug_started)
@@ -7812,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
* though the application is the one updating it.
*/
head = READ_ONCE(ctx->sq_array[sq_idx]);
- if (likely(head < ctx->sq_entries))
+ if (likely(head < ctx->sq_entries)) {
+ /* double index for 128-byte SQEs, twice as long */
+ if (ctx->flags & IORING_SETUP_SQE128)
+ head <<= 1;
return &ctx->sq_sqes[head];
+ }
/* drop invalid entries */
ctx->cq_extra--;
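/*
 * Userspace mirror of the head indexing above: with IORING_SETUP_SQE128
 * each entry occupies two 64-byte slots, so a ring index must be doubled
 * before indexing the mmap'ed SQE array. Minimal sketch; 'sqes' and
 * 'sq_mask' are assumed to come from the usual ring mmap.
 */
#include <linux/io_uring.h>

static inline struct io_uring_sqe *sqe_at(struct io_uring_sqe *sqes,
					  unsigned head, unsigned sq_mask,
					  unsigned setup_flags)
{
	unsigned idx = head & sq_mask;

	if (setup_flags & IORING_SETUP_SQE128)
		idx <<= 1;	/* two 64-byte slots per 128-byte SQE */
	return &sqes[idx];
}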
@@ -7826,54 +9049,52 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
unsigned int entries = io_sqring_entries(ctx);
- int submitted = 0;
+ unsigned int left;
+ int ret;
if (unlikely(!entries))
return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, entries);
- io_get_task_refs(nr);
+ ret = left = min3(nr, ctx->sq_entries, entries);
+ io_get_task_refs(left);
+ io_submit_state_start(&ctx->submit_state, left);
- io_submit_state_start(&ctx->submit_state, nr);
do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- if (unlikely(!io_alloc_req_refill(ctx))) {
- if (!submitted)
- submitted = -EAGAIN;
+ if (unlikely(!io_alloc_req_refill(ctx)))
break;
- }
req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ io_req_add_to_cache(req, ctx);
break;
}
- /* will complete beyond this point, count as submitted */
- submitted++;
- if (io_submit_sqe(ctx, req, sqe)) {
- /*
- * Continue submitting even for sqe failure if the
- * ring was setup with IORING_SETUP_SUBMIT_ALL
- */
- if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
- break;
- }
- } while (submitted < nr);
- if (unlikely(submitted != nr)) {
- int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
- int unused = nr - ref_used;
+ /*
+ * Continue submitting even for sqe failure if the
+ * ring was set up with IORING_SETUP_SUBMIT_ALL.
+ */
+ if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+ !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
+ left--;
+ break;
+ }
+ } while (--left);
- current->io_uring->cached_refs += unused;
+ if (unlikely(left)) {
+ ret -= left;
+ /* try again if it submitted nothing and can't allocate a req */
+ if (!ret && io_req_cache_empty(ctx))
+ ret = -EAGAIN;
+ current->io_uring->cached_refs += left;
}
io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
-
- return submitted;
+ return ret;
}
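/*
 * Caller-side view of the reworked return value: io_submit_sqes() now
 * reports partial progress and returns -EAGAIN only when nothing was
 * submitted and no request could be allocated. A hedged sketch of a
 * submit loop that retries the remainder via the raw syscall; errno
 * handling is simplified.
 */
#include <unistd.h>
#include <sys/syscall.h>

static long submit_all(int ring_fd, unsigned to_submit)
{
	long done = 0;

	while (to_submit) {
		long ret = syscall(__NR_io_uring_enter, ring_fd, to_submit,
				   0, 0, NULL, 0);

		if (ret < 0)	/* errno holds the reason, e.g. EAGAIN */
			return done ? done : -1;
		done += ret;
		to_submit -= ret;
	}
	return done;
}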
static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
@@ -7881,23 +9102,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
return READ_ONCE(sqd->state);
}
-static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
-{
- /* Tell userspace we may need a wakeup call */
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
-static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
-{
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
unsigned int to_submit;
@@ -8013,8 +9217,8 @@ static int io_sq_thread(void *data)
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- io_ring_set_wakeup_flag(ctx);
-
+ atomic_or(IORING_SQ_NEED_WAKEUP,
+ &ctx->rings->sq_flags);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
!wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
@@ -8025,7 +9229,7 @@ static int io_sq_thread(void *data)
* Ensure the store of the wakeup flag is not
* reordered with the load of the SQ tail
*/
- smp_mb();
+ smp_mb__after_atomic();
if (io_sqring_entries(ctx)) {
needs_sched = false;
@@ -8039,7 +9243,8 @@ static int io_sq_thread(void *data)
mutex_lock(&sqd->lock);
}
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_clear_wakeup_flag(ctx);
+ atomic_andnot(IORING_SQ_NEED_WAKEUP,
+ &ctx->rings->sq_flags);
}
finish_wait(&sqd->wait, &wait);
@@ -8049,7 +9254,7 @@ static int io_sq_thread(void *data)
io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_set_wakeup_flag(ctx);
+ atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
io_run_task_work();
mutex_unlock(&sqd->lock);
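/*
 * The NEED_WAKEUP flag is now flipped with atomic_or()/atomic_andnot()
 * instead of under completion_lock, but userspace observes it exactly as
 * before. A minimal sketch of the documented SQPOLL pattern, assuming
 * 'sq_flags' points at the mmap'ed sq ring flags word.
 */
#include <linux/io_uring.h>
#include <stdatomic.h>
#include <unistd.h>
#include <sys/syscall.h>

static void kick_sqpoll_if_needed(int ring_fd, _Atomic unsigned *sq_flags)
{
	/* acquire pairs with the kernel's store + smp_mb__after_atomic() */
	if (atomic_load_explicit(sq_flags, memory_order_acquire) &
	    IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}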
@@ -8089,7 +9294,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
* Cannot safely flush overflowed CQEs from here, ensure we wake up
* the task, and the next invocation will do it.
*/
- if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
+ if (io_should_wake(iowq) ||
+ test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
return autoremove_wake_function(curr, mode, wake_flags, key);
return -1;
}
@@ -8111,15 +9317,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
ktime_t timeout)
{
int ret;
+ unsigned long check_cq;
/* make sure we run task_work before checking for signals */
ret = io_run_task_work_sig();
if (ret || io_should_wake(iowq))
return ret;
+ check_cq = READ_ONCE(ctx->check_cq);
/* let the caller flush overflows, retry */
- if (test_bit(0, &ctx->check_cq_overflow))
+ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
return 1;
-
+ if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
+ return -EBADR;
if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
return -ETIME;
return 1;
@@ -8184,10 +9393,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
- finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
+ finish_wait(&ctx->cq_wait, &iowq.wq);
restore_saved_sigmask_unless(ret == -EINTR);
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
@@ -8425,17 +9634,57 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
GFP_KERNEL_ACCOUNT);
- return !!table->files;
+ if (unlikely(!table->files))
+ return false;
+
+ table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
+ if (unlikely(!table->bitmap)) {
+ kvfree(table->files);
+ return false;
+ }
+
+ return true;
}
static void io_free_file_tables(struct io_file_table *table)
{
kvfree(table->files);
+ bitmap_free(table->bitmap);
table->files = NULL;
+ table->bitmap = NULL;
+}
+
+static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
+{
+ WARN_ON_ONCE(test_bit(bit, table->bitmap));
+ __set_bit(bit, table->bitmap);
+ if (bit == table->alloc_hint)
+ table->alloc_hint++;
+}
+
+static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
+{
+ __clear_bit(bit, table->bitmap);
+ table->alloc_hint = bit;
}
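/*
 * Stand-alone model of how the bitmap and alloc_hint cooperate for fixed
 * file slot allocation: scan from the hint, wrap once, and rely on the
 * set/clear helpers above to keep the hint pointing near free slots.
 * This is a simplified illustration, not the kernel helper itself.
 */
#define SLOT_BITS_PER_LONG	(8 * sizeof(unsigned long))

static int file_slot_alloc(unsigned long *bitmap, unsigned nr,
			   unsigned *alloc_hint)
{
	unsigned i, bit = *alloc_hint;

	for (i = 0; i < nr; i++, bit++) {
		if (bit >= nr)
			bit = 0;	/* wrap around once */
		if (!(bitmap[bit / SLOT_BITS_PER_LONG] &
		      (1UL << (bit % SLOT_BITS_PER_LONG)))) {
			*alloc_hint = bit;
			return bit;
		}
	}
	return -1;	/* table full */
}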
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
+#if !defined(IO_URING_SCM_ALL)
+ int i;
+
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct file *file = io_file_from_index(ctx, i);
+
+ if (!file)
+ continue;
+ if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
+ continue;
+ io_file_bitmap_clear(&ctx->file_table, i);
+ fput(file);
+ }
+#endif
+
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
struct sock *sock = ctx->ring_sock->sk;
@@ -8444,16 +9693,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
kfree_skb(skb);
}
-#else
- int i;
-
- for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file;
-
- file = io_file_from_index(ctx, i);
- if (file)
- fput(file);
- }
#endif
io_free_file_tables(&ctx->file_table);
io_rsrc_data_free(ctx->file_data);
@@ -8598,107 +9837,66 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
return sqd;
}
-#if defined(CONFIG_UNIX)
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
* the io_uring can be safely unregistered on process exit, even if we have
- * loops in the file referencing.
+ * loops in the file referencing. We account only files that can hold other
+ * files, because anything else cannot form a reference loop and so is
+ * not interesting for GC.
*/
-static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
+#if defined(CONFIG_UNIX)
struct sock *sk = ctx->ring_sock->sk;
+ struct sk_buff_head *head = &sk->sk_receive_queue;
struct scm_fp_list *fpl;
struct sk_buff *skb;
- int i, nr_files;
-
- fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
- if (!fpl)
- return -ENOMEM;
- skb = alloc_skb(0, GFP_KERNEL);
- if (!skb) {
- kfree(fpl);
- return -ENOMEM;
- }
+ if (likely(!io_file_need_scm(file)))
+ return 0;
- skb->sk = sk;
+ /*
+ * See if we can merge this file into an existing skb SCM_RIGHTS
+ * file set. If there's no room, fall back to allocating a new skb
+ * and filling it in.
+ */
+ spin_lock_irq(&head->lock);
+ skb = skb_peek(head);
+ if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
+ __skb_unlink(skb, head);
+ else
+ skb = NULL;
+ spin_unlock_irq(&head->lock);
- nr_files = 0;
- fpl->user = get_uid(current_user());
- for (i = 0; i < nr; i++) {
- struct file *file = io_file_from_index(ctx, i + offset);
+ if (!skb) {
+ fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
- if (!file)
- continue;
- fpl->fp[nr_files] = get_file(file);
- unix_inflight(fpl->user, fpl->fp[nr_files]);
- nr_files++;
- }
+ skb = alloc_skb(0, GFP_KERNEL);
+ if (!skb) {
+ kfree(fpl);
+ return -ENOMEM;
+ }
- if (nr_files) {
+ fpl->user = get_uid(current_user());
fpl->max = SCM_MAX_FD;
- fpl->count = nr_files;
+ fpl->count = 0;
+
UNIXCB(skb).fp = fpl;
+ skb->sk = sk;
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_queue_head(&sk->sk_receive_queue, skb);
-
- for (i = 0; i < nr; i++) {
- struct file *file = io_file_from_index(ctx, i + offset);
-
- if (file)
- fput(file);
- }
- } else {
- kfree_skb(skb);
- free_uid(fpl->user);
- kfree(fpl);
- }
-
- return 0;
-}
-
-/*
- * If UNIX sockets are enabled, fd passing can cause a reference cycle which
- * causes regular reference counting to break down. We rely on the UNIX
- * garbage collection to take care of this problem for us.
- */
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
- unsigned left, total;
- int ret = 0;
-
- total = 0;
- left = ctx->nr_user_files;
- while (left) {
- unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
-
- ret = __io_sqe_files_scm(ctx, this_files, total);
- if (ret)
- break;
- left -= this_files;
- total += this_files;
- }
-
- if (!ret)
- return 0;
-
- while (total < ctx->nr_user_files) {
- struct file *file = io_file_from_index(ctx, total);
-
- if (file)
- fput(file);
- total++;
}
- return ret;
-}
-#else
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
+ fpl = UNIXCB(skb).fp;
+ fpl->fp[fpl->count++] = get_file(file);
+ unix_inflight(fpl->user, file);
+ skb_queue_head(head, skb);
+ fput(file);
+#endif
return 0;
}
-#endif
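/*
 * Why the accounting above exists: a file registered with io_uring can
 * itself be an AF_UNIX socket with descriptors "in flight", which can
 * form a reference cycle that only the SCM garbage collector breaks.
 * Hedged userspace sketch of putting an fd in flight; this is the
 * SCM_RIGHTS traffic that UNIXCB(skb).fp tracks on the kernel side.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int send_fd(int sock, int fd)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	char dummy = 0;
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
	return sendmsg(sock, &msg, 0);	/* fd is now "in flight" */
}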
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
@@ -8709,6 +9907,11 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
struct sk_buff *skb;
int i;
+ if (!io_file_need_scm(file)) {
+ fput(file);
+ return;
+ }
+
__skb_queue_head_init(&list);
/*
@@ -8773,15 +9976,17 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
list_del(&prsrc->list);
if (prsrc->tag) {
- bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
+ if (ctx->flags & IORING_SETUP_IOPOLL)
+ mutex_lock(&ctx->uring_lock);
- io_ring_submit_lock(ctx, lock_ring);
spin_lock(&ctx->completion_lock);
io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_ring_submit_unlock(ctx, lock_ring);
+
+ if (ctx->flags & IORING_SETUP_IOPOLL)
+ mutex_unlock(&ctx->uring_lock);
}
rsrc_data->do_put(ctx, prsrc);
@@ -8835,27 +10040,31 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (ret)
return ret;
- ret = -ENOMEM;
- if (!io_alloc_file_tables(&ctx->file_table, nr_args))
- goto out_free;
+ if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
+ io_rsrc_data_free(ctx->file_data);
+ ctx->file_data = NULL;
+ return -ENOMEM;
+ }
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ struct io_fixed_file *file_slot;
+
+ if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
- goto out_fput;
+ goto fail;
}
/* allow sparse sets */
- if (fd == -1) {
+ if (!fds || fd == -1) {
ret = -EINVAL;
if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
- goto out_fput;
+ goto fail;
continue;
}
file = fget(fd);
ret = -EBADF;
if (unlikely(!file))
- goto out_fput;
+ goto fail;
/*
* Don't allow io_uring instances to be registered. If UNIX
@@ -8866,74 +10075,23 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (file->f_op == &io_uring_fops) {
fput(file);
- goto out_fput;
+ goto fail;
}
- io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
- }
-
- ret = io_sqe_files_scm(ctx);
- if (ret) {
- __io_sqe_files_unregister(ctx);
- return ret;
- }
-
- io_rsrc_node_switch(ctx, NULL);
- return ret;
-out_fput:
- for (i = 0; i < ctx->nr_user_files; i++) {
- file = io_file_from_index(ctx, i);
- if (file)
+ ret = io_scm_file_account(ctx, file);
+ if (ret) {
fput(file);
- }
- io_free_file_tables(&ctx->file_table);
- ctx->nr_user_files = 0;
-out_free:
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
- return ret;
-}
-
-static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
- int index)
-{
-#if defined(CONFIG_UNIX)
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff_head *head = &sock->sk_receive_queue;
- struct sk_buff *skb;
-
- /*
- * See if we can merge this file into an existing skb SCM_RIGHTS
- * file set. If there's no room, fall back to allocating a new skb
- * and filling it in.
- */
- spin_lock_irq(&head->lock);
- skb = skb_peek(head);
- if (skb) {
- struct scm_fp_list *fpl = UNIXCB(skb).fp;
-
- if (fpl->count < SCM_MAX_FD) {
- __skb_unlink(skb, head);
- spin_unlock_irq(&head->lock);
- fpl->fp[fpl->count] = get_file(file);
- unix_inflight(fpl->user, fpl->fp[fpl->count]);
- fpl->count++;
- spin_lock_irq(&head->lock);
- __skb_queue_head(head, skb);
- } else {
- skb = NULL;
+ goto fail;
}
- }
- spin_unlock_irq(&head->lock);
-
- if (skb) {
- fput(file);
- return 0;
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, i);
}
- return __io_sqe_files_scm(ctx, 1, index);
-#else
+ io_rsrc_node_switch(ctx, NULL);
return 0;
-#endif
+fail:
+ __io_sqe_files_unregister(ctx);
+ return ret;
}
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
@@ -8957,12 +10115,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8988,22 +10145,20 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
if (ret)
goto err;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
- ret = io_sqe_file_register(ctx, file, slot_index);
- if (ret) {
- file_slot->file_ptr = 0;
- goto err;
+ ret = io_scm_file_account(ctx, file);
+ if (!ret) {
+ *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, slot_index);
}
-
- ret = 0;
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
if (ret)
fput(file);
return ret;
@@ -9013,12 +10168,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
int ret;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = -ENXIO;
if (unlikely(!ctx->file_data))
goto out;
@@ -9041,10 +10195,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
goto out;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
@@ -9090,6 +10245,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (err)
break;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, i);
needs_switch = true;
}
if (fd != -1) {
@@ -9111,14 +10267,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- *io_get_tag_slot(data, i) = tag;
- io_fixed_file_set(file_slot, file);
- err = io_sqe_file_register(ctx, file, i);
+ err = io_scm_file_account(ctx, file);
if (err) {
- file_slot->file_ptr = 0;
fput(file);
break;
}
+ *io_get_tag_slot(data, i) = tag;
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, i);
}
}
@@ -9198,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prior_task_list);
+ INIT_WQ_LIST(&tctx->prio_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@@ -9376,8 +10532,8 @@ static void *io_mem_alloc(size_t size)
return (void *) __get_free_pages(gfp, get_order(size));
}
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
- size_t *sq_offset)
+static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
+ unsigned int cq_entries, size_t *sq_offset)
{
struct io_rings *rings;
size_t off, sq_array_size;
@@ -9385,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
off = struct_size(rings, cqes, cq_entries);
if (off == SIZE_MAX)
return SIZE_MAX;
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ if (check_shl_overflow(off, 1, &off))
+ return SIZE_MAX;
+ }
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
@@ -9546,30 +10706,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf **pimu,
- struct page **last_hpage)
+static struct page **io_pin_pages(unsigned long ubuf, unsigned long len,
+ int *npages)
{
- struct io_mapped_ubuf *imu = NULL;
+ unsigned long start, end, nr_pages;
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
- unsigned long off, start, end, ubuf;
- size_t size;
- int ret, pret, nr_pages, i;
+ int i, pret, ret = -ENOMEM;
- if (!iov->iov_base) {
- *pimu = ctx->dummy_ubuf;
- return 0;
- }
-
- ubuf = (unsigned long) iov->iov_base;
- end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
- *pimu = NULL;
- ret = -ENOMEM;
-
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto done;
@@ -9579,10 +10727,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
if (!vmas)
goto done;
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
- if (!imu)
- goto done;
-
ret = 0;
mmap_read_lock(current->mm);
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
@@ -9600,6 +10744,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
break;
}
}
+ *npages = nr_pages;
} else {
ret = pret < 0 ? pret : -EFAULT;
}
@@ -9613,14 +10758,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unpin_user_pages(pages, pret);
goto done;
}
+ ret = 0;
+done:
+ kvfree(vmas);
+ if (ret < 0) {
+ kvfree(pages);
+ pages = ERR_PTR(ret);
+ }
+ return pages;
+}
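/*
 * The page-range arithmetic factored into io_pin_pages(), restated as a
 * stand-alone helper: round the end up and the start down to page
 * boundaries. For example, ubuf=0x1ff0 and len=0x20 with 4K pages spans
 * two pages even though len is only 32 bytes.
 */
static unsigned long buf_nr_pages(unsigned long ubuf, unsigned long len,
				  unsigned long page_size)
{
	unsigned long start = ubuf / page_size;
	unsigned long end = (ubuf + len + page_size - 1) / page_size;

	return end - start;
}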
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+ struct io_mapped_ubuf **pimu,
+ struct page **last_hpage)
+{
+ struct io_mapped_ubuf *imu = NULL;
+ struct page **pages = NULL;
+ unsigned long off;
+ size_t size;
+ int ret, nr_pages, i;
+
+ if (!iov->iov_base) {
+ *pimu = ctx->dummy_ubuf;
+ return 0;
+ }
+
+ *pimu = NULL;
+ ret = -ENOMEM;
+
+ pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
+ &nr_pages);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ pages = NULL;
+ goto done;
+ }
+
+ imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ if (!imu)
+ goto done;
- ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
+ ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
if (ret) {
- unpin_user_pages(pages, pret);
+ unpin_user_pages(pages, nr_pages);
goto done;
}
- off = ubuf & ~PAGE_MASK;
+ off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len;
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
@@ -9633,8 +10817,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
size -= vec_len;
}
/* store original address for later verification */
- imu->ubuf = ubuf;
- imu->ubuf_end = ubuf + iov->iov_len;
+ imu->ubuf = (unsigned long) iov->iov_base;
+ imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
*pimu = imu;
ret = 0;
@@ -9642,7 +10826,6 @@ done:
if (ret)
kvfree(imu);
kvfree(pages);
- kvfree(vmas);
return ret;
}
@@ -9701,12 +10884,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- ret = io_copy_iov(ctx, &iov, arg, i);
- if (ret)
- break;
- ret = io_buffer_validate(&iov);
- if (ret)
- break;
+ if (arg) {
+ ret = io_copy_iov(ctx, &iov, arg, i);
+ if (ret)
+ break;
+ ret = io_buffer_validate(&iov);
+ if (ret)
+ break;
+ } else {
+ memset(&iov, 0, sizeof(iov));
+ }
+
if (!iov.iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
@@ -9845,19 +11033,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
+ struct io_buffer_list *bl;
+ unsigned long index;
int i;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
- struct list_head *list = &ctx->io_buffers[i];
-
- while (!list_empty(list)) {
- struct io_buffer_list *bl;
+ for (i = 0; i < BGID_ARRAY; i++) {
+ if (!ctx->io_bl)
+ break;
+ __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
+ }
- bl = list_first_entry(list, struct io_buffer_list, list);
- __io_remove_buffers(ctx, bl, -1U);
- list_del(&bl->list);
- kfree(bl);
- }
+ xa_for_each(&ctx->io_bl_xa, index, bl) {
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ __io_remove_buffers(ctx, bl, -1U);
}
while (!list_empty(&ctx->io_buffers_pages)) {
@@ -9877,7 +11065,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
mutex_lock(&ctx->uring_lock);
io_flush_cached_locked_reqs(ctx, state);
- while (state->free_list.next) {
+ while (!io_req_cache_empty(ctx)) {
struct io_wq_work_node *node;
struct io_kiocb *req;
@@ -9966,7 +11154,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
@@ -9997,7 +11186,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
* Users may get EPOLLIN while seeing nothing in the cqring; this
* pushes them to do the flush.
*/
- if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
+ if (io_cqring_events(ctx) ||
+ test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
@@ -10129,8 +11319,7 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
}
}
spin_unlock_irq(&ctx->timeout_lock);
- if (canceled != 0)
- io_commit_cqring(ctx);
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
if (canceled != 0)
io_cqring_ev_posted(ctx);
@@ -10150,11 +11339,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx, NULL, true);
- io_poll_remove_all(ctx, NULL, true);
-
- /* if we failed setting up the ctx, we might not have any rings */
- io_iopoll_try_reap_events(ctx);
+ /* failed during ring init, so it couldn't have issued any requests */
+ if (ctx->rings) {
+ io_kill_timeouts(ctx, NULL, true);
+ io_poll_remove_all(ctx, NULL, true);
+ /* if we failed setting up the ctx, we might not have any rings */
+ io_iopoll_try_reap_events(ctx);
+ }
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
@@ -10246,6 +11437,10 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
+ /* failed during ring init, so it couldn't have issued any requests */
+ if (!ctx->rings)
+ return;
+
while (1) {
enum io_wq_cancel cret;
bool ret = false;
@@ -10588,7 +11783,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
ret = -EFAULT;
break;
}
- if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) {
+ if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
ret = -EINVAL;
break;
}
@@ -10691,6 +11886,19 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
return 0;
}
+static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+{
+ if (flags & IORING_ENTER_EXT_ARG) {
+ struct io_uring_getevents_arg arg;
+
+ if (argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+ }
+ return 0;
+}
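/*
 * What io_validate_ext_arg() checks against: with IORING_ENTER_EXT_ARG
 * the sigmask argument of io_uring_enter() instead points to a struct
 * io_uring_getevents_arg and argsz must be its size. Hedged sketch of a
 * timed CQE wait using the raw syscall.
 */
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <unistd.h>
#include <sys/syscall.h>

static long wait_cqes_timeout(int ring_fd, unsigned want, long long nsec)
{
	struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = nsec };
	struct io_uring_getevents_arg arg = {
		.sigmask = 0,			/* no signal mask */
		.sigmask_sz = 0,
		.ts = (unsigned long long)(unsigned long)&ts,
	};

	return syscall(__NR_io_uring_enter, ring_fd, 0, want,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}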
+
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
struct __kernel_timespec __user **ts,
const sigset_t __user **sig)
@@ -10728,7 +11936,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
size_t, argsz)
{
struct io_ring_ctx *ctx;
- int submitted = 0;
struct fd f;
long ret;
@@ -10791,39 +11998,64 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (ret)
goto out;
}
- submitted = to_submit;
+ ret = to_submit;
} else if (to_submit) {
ret = io_uring_add_tctx_node(ctx);
if (unlikely(ret))
goto out;
- mutex_lock(&ctx->uring_lock);
- submitted = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
- if (submitted != to_submit)
+ mutex_lock(&ctx->uring_lock);
+ ret = io_submit_sqes(ctx, to_submit);
+ if (ret != to_submit) {
+ mutex_unlock(&ctx->uring_lock);
goto out;
+ }
+ if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
+ goto iopoll_locked;
+ mutex_unlock(&ctx->uring_lock);
}
if (flags & IORING_ENTER_GETEVENTS) {
- const sigset_t __user *sig;
- struct __kernel_timespec __user *ts;
-
- ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
- if (unlikely(ret))
- goto out;
+ int ret2;
+ if (ctx->syscall_iopoll) {
+ /*
+ * We disallow the app entering submit/complete with
+ * polling, but we still need to lock the ring to
+ * prevent racing with polled issue that got punted to
+ * a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
+iopoll_locked:
+ ret2 = io_validate_ext_arg(flags, argp, argsz);
+ if (likely(!ret2)) {
+ min_complete = min(min_complete,
+ ctx->cq_entries);
+ ret2 = io_iopoll_check(ctx, min_complete);
+ }
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ const sigset_t __user *sig;
+ struct __kernel_timespec __user *ts;
+
+ ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+ if (likely(!ret2)) {
+ min_complete = min(min_complete,
+ ctx->cq_entries);
+ ret2 = io_cqring_wait(ctx, min_complete, sig,
+ argsz, ts);
+ }
+ }
- min_complete = min(min_complete, ctx->cq_entries);
+ if (!ret) {
+ ret = ret2;
- /*
- * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
- * space applications don't need to do io completion events
- * polling again, they can rely on io_sq_thread to do polling
- * work, which can reduce cpu usage and uring_lock contention.
- */
- if (ctx->flags & IORING_SETUP_IOPOLL &&
- !(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_iopoll_check(ctx, min_complete);
- } else {
- ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
+ /*
+ * -EBADR indicates that one or more CQEs were dropped.
+ * Once the user has been informed, we can clear the bit
+ * as they are obviously OK with those drops.
+ */
+ if (unlikely(ret2 == -EBADR))
+ clear_bit(IO_CHECK_CQ_DROPPED_BIT,
+ &ctx->check_cq);
}
}
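/*
 * Handling the new -EBADR result from the wait path: a hedged sketch
 * that treats the dropped-CQE notification as a one-shot warning and
 * retries, mirroring the clear_bit() above. Raw syscall; real code
 * would log the loss before continuing.
 */
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static long wait_cqes(int ring_fd, unsigned want)
{
	for (;;) {
		long ret = syscall(__NR_io_uring_enter, ring_fd, 0, want,
				   IORING_ENTER_GETEVENTS, NULL, 0);

		if (ret >= 0)
			return ret;
		/* CQEs were dropped; the kernel cleared the flag, retry */
		if (errno == EBADR)
			continue;
		return -errno;
	}
}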
@@ -10832,7 +12064,7 @@ out:
out_fput:
if (!(flags & IORING_ENTER_REGISTERED_RING))
fdput(f);
- return submitted ? submitted : ret;
+ return ret;
}
#ifdef CONFIG_PROC_FS
@@ -10879,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
unsigned int sq_tail = READ_ONCE(r->sq.tail);
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int cq_shift = 0;
unsigned int sq_entries, cq_entries;
bool has_lock;
+ bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
unsigned int i;
+ if (is_cqe32)
+ cq_shift = 1;
+
/*
* we may get imprecise sqe and cqe info if uring is actively running
* since we get cached_sq_head and cached_cq_tail without uring_lock
@@ -10915,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
for (i = 0; i < cq_entries; i++) {
unsigned int entry = i + cq_head;
- struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+ struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
- seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ if (!is_cqe32) {
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
+ } else {
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
+ "extra1:%llu, extra2:%llu\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
+ }
}
/*
@@ -11022,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
- size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+ size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -11037,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->sq_ring_entries = p->sq_entries;
rings->cq_ring_entries = p->cq_entries;
- size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+ if (p->flags & IORING_SETUP_SQE128)
+ size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
+ else
+ size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
io_mem_free(ctx->rings);
ctx->rings = NULL;
@@ -11149,11 +12396,41 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx = io_ring_ctx_alloc(p);
if (!ctx)
return -ENOMEM;
+
+ /*
+ * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
+ * space applications don't need to do io completion events
+ * polling again, they can rely on io_sq_thread to do polling
+ * work, which can reduce cpu usage and uring_lock contention.
+ */
+ if (ctx->flags & IORING_SETUP_IOPOLL &&
+ !(ctx->flags & IORING_SETUP_SQPOLL))
+ ctx->syscall_iopoll = 1;
+
ctx->compat = in_compat_syscall();
if (!capable(CAP_IPC_LOCK))
ctx->user = get_uid(current_user());
/*
+ * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
+ * COOP_TASKRUN is set, then IPIs are never needed by the app.
+ */
+ ret = -EINVAL;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ /* IPI related flags don't make sense with SQPOLL */
+ if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
+ IORING_SETUP_TASKRUN_FLAG))
+ goto err;
+ ctx->notify_method = TWA_SIGNAL_NO_IPI;
+ } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
+ ctx->notify_method = TWA_SIGNAL_NO_IPI;
+ } else {
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ goto err;
+ ctx->notify_method = TWA_SIGNAL;
+ }
+
+ /*
* This is just grabbed for accounting purposes. When a process exits,
* the mm is exited and dropped before the files, hence we need to hang
* on to this mm purely for the purposes of being able to unaccount
@@ -11250,10 +12527,12 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
+ IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
+ IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
return -EINVAL;
- return io_uring_create(entries, &p, params);
+ return io_uring_create(entries, &p, params);
}
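/*
 * Hedged sketch of opting into the task-work modes validated above:
 * IORING_SETUP_COOP_TASKRUN suppresses IPIs for task_work, and
 * IORING_SETUP_TASKRUN_FLAG additionally makes the kernel raise
 * IORING_SQ_TASKRUN in the sq ring flags when an io_uring_enter() call
 * is needed to process pending work. Neither combines with SQPOLL.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int setup_coop_ring(unsigned entries, struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG;
	return syscall(__NR_io_uring_setup, entries, p);	/* ring fd */
}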
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
@@ -11466,14 +12745,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
memset(&rr, 0, sizeof(rr));
if (copy_from_user(&rr, arg, size))
return -EFAULT;
- if (!rr.nr || rr.resv || rr.resv2)
+ if (!rr.nr || rr.resv2)
+ return -EINVAL;
+ if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
return -EINVAL;
switch (type) {
case IORING_RSRC_FILE:
+ if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
+ break;
return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
case IORING_RSRC_BUFFER:
+ if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
+ break;
return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
}
@@ -11608,6 +12893,85 @@ err:
return ret;
}
+static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_ring *br;
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+ struct page **pages;
+ int nr_pages;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+
+ if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+ return -EINVAL;
+ if (!reg.ring_addr)
+ return -EFAULT;
+ if (reg.ring_addr & ~PAGE_MASK)
+ return -EINVAL;
+ if (!is_power_of_2(reg.ring_entries))
+ return -EINVAL;
+
+ if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
+ int ret = io_init_bl_list(ctx);
+ if (ret)
+ return ret;
+ }
+
+ bl = io_buffer_get_list(ctx, reg.bgid);
+ if (bl) {
+ /* if a mapped buffer ring OR classic provided buffers exist, don't allow */
+ if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
+ return -EEXIST;
+ } else {
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return -ENOMEM;
+ }
+
+ pages = io_pin_pages(reg.ring_addr,
+ struct_size(br, bufs, reg.ring_entries),
+ &nr_pages);
+ if (IS_ERR(pages)) {
+ kfree(bl);
+ return PTR_ERR(pages);
+ }
+
+ br = page_address(pages[0]);
+ bl->buf_pages = pages;
+ bl->buf_nr_pages = nr_pages;
+ bl->nr_entries = reg.ring_entries;
+ bl->buf_ring = br;
+ bl->mask = reg.ring_entries - 1;
+ io_buffer_add_list(ctx, bl, reg.bgid);
+ return 0;
+}
+
+static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+ if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+ return -EINVAL;
+
+ bl = io_buffer_get_list(ctx, reg.bgid);
+ if (!bl)
+ return -ENOENT;
+ if (!bl->buf_nr_pages)
+ return -EINVAL;
+
+ __io_remove_buffers(ctx, bl, -1U);
+ if (bl->bgid >= BGID_ARRAY) {
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ kfree(bl);
+ }
+ return 0;
+}
+
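For the new opcodes, a hedged userspace sketch of registering a provided
buffer ring, mirroring the validation in io_register_pbuf_ring() above
(page-aligned address, power-of-two entry count, nr_args of 1), and
assuming uapi headers that carry struct io_uring_buf_reg:

	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/io_uring.h>

	static int register_buf_ring(int ring_fd, void *ring_mem,
				     unsigned int entries, unsigned short bgid)
	{
		struct io_uring_buf_reg reg;

		memset(&reg, 0, sizeof(reg));
		reg.ring_addr = (unsigned long)ring_mem; /* must be page aligned */
		reg.ring_entries = entries;		 /* must be a power of two */
		reg.bgid = bgid;
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_PBUF_RING, &reg, 1);
	}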
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -11633,6 +12997,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
switch (opcode) {
case IORING_REGISTER_BUFFERS:
+ ret = -EFAULT;
+ if (!arg)
+ break;
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_BUFFERS:
@@ -11642,6 +13009,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
+ ret = -EFAULT;
+ if (!arg)
+ break;
ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_FILES:
@@ -11736,6 +13106,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_UNREGISTER_RING_FDS:
ret = io_ringfd_unregister(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_pbuf_ring(ctx, arg);
+ break;
+ case IORING_UNREGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_unregister_pbuf_ring(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
@@ -11812,6 +13194,7 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
+ BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
sizeof(struct io_uring_rsrc_update));
@@ -11820,6 +13203,10 @@ static int __init io_uring_init(void)
/* ->buf_index is u16 */
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
+ BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE);
+ BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
+ BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
+ offsetof(struct io_uring_buf_ring, tail));
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
@@ -11829,6 +13216,10 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
+ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+
+ BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
+
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT);
return 0;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8ce8720093b9..d2a9f699e17e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -297,7 +297,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
- * what do_mpage_readpage does.
+ * what do_mpage_read_folio does.
*/
if (!ctx->bio) {
ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
@@ -320,10 +320,8 @@ done:
return pos - orig_pos + plen;
}
-int
-iomap_readpage(struct page *page, const struct iomap_ops *ops)
+int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
{
- struct folio *folio = page_folio(page);
struct iomap_iter iter = {
.inode = folio->mapping->host,
.pos = folio_pos(folio),
@@ -351,13 +349,13 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
}
/*
- * Just like mpage_readahead and block_read_full_page, we always
- * return 0 and just mark the page as PageError on errors. This
+ * Just like mpage_readahead and block_read_full_folio, we always
+ * return 0 and just set the folio error flag on errors. This
* should be cleaned up throughout the stack eventually.
*/
return 0;
}
-EXPORT_SYMBOL_GPL(iomap_readpage);
+EXPORT_SYMBOL_GPL(iomap_read_folio);
static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
@@ -454,25 +452,23 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
-int
-iomap_releasepage(struct page *page, gfp_t gfp_mask)
+bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- struct folio *folio = page_folio(page);
-
- trace_iomap_releasepage(folio->mapping->host, folio_pos(folio),
+ trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
folio_size(folio));
/*
- * mm accommodates an old ext3 case where clean pages might not have had
- * the dirty bit cleared. Thus, it can send actual dirty pages to
- * ->releasepage() via shrink_active_list(); skip those here.
+ * mm accommodates an old ext3 case where clean folios might
+ * not have had the dirty bit cleared. Thus, it can send actual
+ * dirty folios to ->release_folio() via shrink_active_list();
+ * skip those here.
*/
if (folio_test_dirty(folio) || folio_test_writeback(folio))
- return 0;
+ return false;
iomap_page_release(folio);
- return 1;
+ return true;
}
-EXPORT_SYMBOL_GPL(iomap_releasepage);
+EXPORT_SYMBOL_GPL(iomap_release_folio);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
@@ -531,7 +527,8 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
* write started inside the existing inode size.
*/
if (pos + len > i_size)
- truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+ truncate_pagecache_range(inode, max(pos, i_size),
+ pos + len - 1);
}
static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
@@ -663,10 +660,10 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
/*
* The blocks that were entirely written will now be uptodate, so we
- * don't have to worry about a readpage reading them and overwriting a
+ * don't have to worry about a read_folio reading them and overwriting a
* partial write. However, if we've encountered a short write and only
* partially written into a block, it will not be marked uptodate, so a
- * readpage might come in and destroy our partial write.
+ * read_folio might come in and destroy our partial write.
*
* Do the simplest thing and just treat any short write to a
* non-uptodate page as a zero-length write, and force the caller to
@@ -733,7 +730,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
folio_put(folio);
if (ret < len)
- iomap_write_failed(iter->inode, pos, len);
+ iomap_write_failed(iter->inode, pos + ret, len - ret);
return ret;
}
@@ -1386,7 +1383,6 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
if (wpc->ops->discard_folio)
wpc->ops->discard_folio(folio, pos);
if (!count) {
- folio_clear_uptodate(folio);
folio_unlock(folio);
goto done;
}
@@ -1485,7 +1481,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
* Skip the page if it's fully outside i_size, e.g. due to a
* truncate operation that's in progress. We must redirty the
* page so that reclaim stops reclaiming it. Otherwise
- * iomap_vm_releasepage() is called on it and gets confused.
+ * iomap_release_folio() is called on it and gets confused.
*
* Note that the end_index is unsigned long. If the given
* offset is greater than 16TB on a 32-bit system then if we
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b08f5dc31780..370c3241618a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -51,12 +51,22 @@ struct iomap_dio {
};
};
+static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
+ struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf)
+{
+ if (dio->dops && dio->dops->bio_set)
+ return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
+ GFP_KERNEL, dio->dops->bio_set);
+ return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
+}
+
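The helper above lets a filesystem route direct-I/O bio allocation
through its own bio_set, so it can embed private per-bio state. A
minimal sketch, assuming a hypothetical filesystem "myfs" and the
->bio_set member of struct iomap_dio_ops that this helper dereferences:

	/* initialised with bioset_init() at module load; the myfs_*
	 * names are placeholders, not from this patch */
	static struct bio_set myfs_dio_bioset;

	static const struct iomap_dio_ops myfs_dio_ops = {
		.end_io  = myfs_dio_end_io,	/* hypothetical end_io hook */
		.bio_set = &myfs_dio_bioset,
	};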
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI) {
+ /* Sync dio can't be polled reliably */
+ if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
bio_set_polled(bio, dio->iocb);
dio->submit.poll_bio = bio;
}
@@ -144,7 +154,7 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-static void iomap_dio_bio_end_io(struct bio *bio)
+void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
@@ -176,16 +186,16 @@ static void iomap_dio_bio_end_io(struct bio *bio)
bio_put(bio);
}
}
+EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
- int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
- bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+ bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
@@ -265,8 +275,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* cache flushes on IO completion.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) &&
- blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
use_fua = true;
}
@@ -311,7 +320,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+ bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
@@ -474,7 +483,7 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before)
+ unsigned int dio_flags, void *private, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -483,6 +492,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
.flags = IOMAP_DIRECT,
+ .private = private,
};
loff_t end = iomi.pos + iomi.len - 1, ret = 0;
bool wait_for_completion =
@@ -654,9 +664,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!dio->submit.poll_bio ||
- !bio_poll(dio->submit.poll_bio, NULL, 0))
- blk_io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -674,11 +682,12 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before)
+ unsigned int dio_flags, void *private, size_t done_before)
{
struct iomap_dio *dio;
- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
+ done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
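Callers of iomap_dio_rw() and __iomap_dio_rw() gain one argument; the
pointer lands in iomap_iter.private, where the filesystem's
->iomap_begin can pick it up. A hedged example of an updated call site,
with the myfs_* names standing in for a real filesystem's ops:

	ret = iomap_dio_rw(iocb, from, &myfs_iomap_ops, &myfs_dio_ops,
			   0, NULL, 0);	/* no private data: pass NULL */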
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index a6689a563c6e..d48868fc40d7 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -80,7 +80,7 @@ DEFINE_EVENT(iomap_range_class, name, \
TP_PROTO(struct inode *inode, loff_t off, u64 len),\
TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
-DEFINE_RANGE_EVENT(iomap_releasepage);
+DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index bc12ac7e2312..95a19f25d61c 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -296,8 +296,9 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
* per reference. We inject the additional pages into the page
* cache as a form of readahead.
*/
-static int zisofs_readpage(struct file *file, struct page *page)
+static int zisofs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
int err;
@@ -369,7 +370,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
}
const struct address_space_operations zisofs_aops = {
- .readpage = zisofs_readpage,
+ .read_folio = zisofs_read_folio,
/* No bmap operation supported */
};
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d7491692aea3..88bf20303466 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1174,9 +1174,9 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
return sb_bread(inode->i_sb, blknr);
}
-static int isofs_readpage(struct file *file, struct page *page)
+static int isofs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, isofs_get_block);
+ return mpage_read_folio(folio, isofs_get_block);
}
static void isofs_readahead(struct readahead_control *rac)
@@ -1190,7 +1190,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations isofs_aops = {
- .readpage = isofs_readpage,
+ .read_folio = isofs_read_folio,
.readahead = isofs_readahead,
.bmap = _isofs_bmap
};
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 4880146babaf..48f58c6c9e69 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -687,11 +687,12 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
}
/*
- * readpage() for symlinks: reads symlink contents into the page and either
+ * read_folio() for symlinks: reads symlink contents into the folio and either
* makes it uptodate and returns 0 or returns error (-EIO)
*/
-static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
+static int rock_ridge_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
@@ -804,5 +805,5 @@ error:
}
const struct address_space_operations isofs_symlink_aops = {
- .readpage = rock_ridge_symlink_readpage
+ .read_folio = rock_ridge_symlink_read_folio
};
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ac7f067b7bdd..eb315e81f1a6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -62,6 +62,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
*/
static void release_buffer_page(struct buffer_head *bh)
{
+ struct folio *folio;
struct page *page;
if (buffer_dirty(bh))
@@ -71,18 +72,19 @@ static void release_buffer_page(struct buffer_head *bh)
page = bh->b_page;
if (!page)
goto nope;
- if (page->mapping)
+ folio = page_folio(page);
+ if (folio->mapping)
goto nope;
/* OK, it's a truncated page */
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto nope;
- get_page(page);
+ folio_get(folio);
__brelse(bh);
- try_to_free_buffers(page);
- unlock_page(page);
- put_page(page);
+ try_to_free_buffers(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return;
nope:
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fcacafa4510d..c0cbeeaec2d1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1762,7 +1762,6 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
unsigned long block, log_offset; /* logical */
unsigned long long phys_block, block_start, block_stop; /* physical */
loff_t byte_start, byte_stop, byte_count;
- struct request_queue *q = bdev_get_queue(journal->j_dev);
/* flags must be set to either discard or zeroout */
if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
@@ -1770,10 +1769,8 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
return -EINVAL;
- if (!q)
- return -ENXIO;
-
- if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(journal->j_dev))
return -EOPNOTSUPP;
/*
@@ -1828,7 +1825,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
err = blkdev_issue_discard(journal->j_dev,
byte_start >> SECTOR_SHIFT,
byte_count >> SECTOR_SHIFT,
- GFP_NOFS, 0);
+ GFP_NOFS);
} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
err = blkdev_issue_zeroout(journal->j_dev,
byte_start >> SECTOR_SHIFT,
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index fcb9175016a5..e49bb0938376 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2143,17 +2143,17 @@ out:
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
*
- * Return 0 on failure, 1 on success
+ * Return false on failure, true on success
*/
-int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
+bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
{
struct buffer_head *head;
struct buffer_head *bh;
- int ret = 0;
+ bool ret = false;
- J_ASSERT(PageLocked(page));
+ J_ASSERT(folio_test_locked(folio));
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
do {
struct journal_head *jh;
@@ -2175,7 +2175,7 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
goto busy;
} while ((bh = bh->b_this_page) != head);
- ret = try_to_free_buffers(page);
+ ret = try_to_free_buffers(folio);
busy:
return ret;
}
@@ -2482,7 +2482,7 @@ int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
} while (bh != head);
if (!partial_page) {
- if (may_free && try_to_free_buffers(&folio->page))
+ if (may_free && try_to_free_buffers(folio))
J_ASSERT(!folio_buffers(folio));
}
return 0;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index bd7d58d27bfc..ba86acbe12d3 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -25,9 +25,9 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *pg, void *fsdata);
static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata);
-static int jffs2_readpage (struct file *filp, struct page *pg);
+static int jffs2_read_folio(struct file *filp, struct folio *folio);
int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
@@ -72,7 +72,7 @@ const struct inode_operations jffs2_file_inode_operations =
const struct address_space_operations jffs2_file_address_operations =
{
- .readpage = jffs2_readpage,
+ .read_folio = jffs2_read_folio,
.write_begin = jffs2_write_begin,
.write_end = jffs2_write_end,
};
@@ -110,27 +110,26 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
return ret;
}
-int jffs2_do_readpage_unlock(void *data, struct page *pg)
+int __jffs2_read_folio(struct file *file, struct folio *folio)
{
- int ret = jffs2_do_readpage_nolock(data, pg);
- unlock_page(pg);
+ int ret = jffs2_do_readpage_nolock(folio->mapping->host, &folio->page);
+ folio_unlock(folio);
return ret;
}
-
-static int jffs2_readpage (struct file *filp, struct page *pg)
+static int jffs2_read_folio(struct file *file, struct folio *folio)
{
- struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host);
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(folio->mapping->host);
int ret;
mutex_lock(&f->sem);
- ret = jffs2_do_readpage_unlock(pg->mapping->host, pg);
+ ret = __jffs2_read_folio(file, folio);
mutex_unlock(&f->sem);
return ret;
}
static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct page *pg;
@@ -213,7 +212,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* page in read_cache_page(), which causes a deadlock.
*/
mutex_lock(&c->alloc_sem);
- pg = grab_cache_page_write_begin(mapping, index, flags);
+ pg = grab_cache_page_write_begin(mapping, index);
if (!pg) {
ret = -ENOMEM;
goto release_sem;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 71f03a5d36ed..00a110f40e10 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -178,7 +178,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
jffs2_complete_reservation(c);
/* We have to do the truncate_setsize() without f->sem held, since
- some pages may be locked and waiting for it in readpage().
+ some pages may be locked and waiting for it in read_folio().
We are protected from a simultaneous write() extending i_size
back past iattr->ia_size, because do_truncate() holds the
generic inode semaphore. */
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 373b3b7c9f44..5c6602f3c189 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1327,7 +1327,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
* trying to write out, read_cache_page() will not deadlock. */
mutex_unlock(&f->sem);
page = read_cache_page(inode->i_mapping, start >> PAGE_SHIFT,
- jffs2_do_readpage_unlock, inode);
+ __jffs2_read_folio, NULL);
if (IS_ERR(page)) {
pr_warn("read_cache_page() returned error: %ld\n",
PTR_ERR(page));
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 173eccac691d..921d782583d6 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -155,7 +155,7 @@ extern const struct file_operations jffs2_file_operations;
extern const struct inode_operations jffs2_file_inode_operations;
extern const struct address_space_operations jffs2_file_address_operations;
int jffs2_fsync(struct file *, loff_t, loff_t, int);
-int jffs2_do_readpage_unlock(void *data, struct page *pg);
+int __jffs2_read_folio(struct file *file, struct folio *folio);
/* ioctl.c */
long jffs2_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index d1943a7b4b04..a5dd7e53754a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -293,9 +293,9 @@ static int jfs_writepages(struct address_space *mapping,
return mpage_writepages(mapping, wbc, jfs_get_block);
}
-static int jfs_readpage(struct file *file, struct page *page)
+static int jfs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, jfs_get_block);
+ return mpage_read_folio(folio, jfs_get_block);
}
static void jfs_readahead(struct readahead_control *rac)
@@ -314,13 +314,12 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
}
static int jfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
- jfs_get_block);
+ ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, jfs_get_block);
if (unlikely(ret))
jfs_write_failed(mapping, pos + len);
@@ -360,7 +359,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations jfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = jfs_readpage,
+ .read_folio = jfs_read_folio,
.readahead = jfs_readahead,
.writepage = jfs_writepage,
.writepages = jfs_writepages,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 03a845ab4f00..1e7b177ece60 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -110,14 +110,13 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
s64 ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q)) {
+ if (!bdev_max_discard_sectors(sb->s_bdev)) {
jfs_warn("FITRIM not supported on device");
return -EOPNOTSUPP;
}
@@ -127,7 +126,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
ret = jfs_ioc_trim(inode, &range);
if (ret < 0)
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index c4220ccdedef..387652ae14c2 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -467,8 +467,9 @@ err_out:
return -EIO;
}
-static int metapage_readpage(struct file *fp, struct page *page)
+static int metapage_read_folio(struct file *fp, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct bio *bio = NULL;
int block_offset;
@@ -523,29 +524,29 @@ add_failed:
return -EIO;
}
-static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
+static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask)
{
struct metapage *mp;
- int ret = 1;
+ bool ret = true;
int offset;
for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
- mp = page_to_mp(page, offset);
+ mp = page_to_mp(&folio->page, offset);
if (!mp)
continue;
- jfs_info("metapage_releasepage: mp = 0x%p", mp);
+ jfs_info("metapage_release_folio: mp = 0x%p", mp);
if (mp->count || mp->nohomeok ||
test_bit(META_dirty, &mp->flag)) {
jfs_info("count = %ld, nohomeok = %d", mp->count,
mp->nohomeok);
- ret = 0;
+ ret = false;
continue;
}
if (mp->lsn)
remove_from_logsync(mp);
- remove_metapage(page, mp);
+ remove_metapage(&folio->page, mp);
INCREMENT(mpStat.pagefree);
free_metapage(mp);
}
@@ -559,13 +560,13 @@ static void metapage_invalidate_folio(struct folio *folio, size_t offset,
BUG_ON(folio_test_writeback(folio));
- metapage_releasepage(&folio->page, 0);
+ metapage_release_folio(folio, 0);
}
const struct address_space_operations jfs_metapage_aops = {
- .readpage = metapage_readpage,
+ .read_folio = metapage_read_folio,
.writepage = metapage_writepage,
- .releasepage = metapage_releasepage,
+ .release_folio = metapage_release_folio,
.invalidate_folio = metapage_invalidate_folio,
.dirty_folio = filemap_dirty_folio,
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index f1a13a74cddf..85d4f44f2ac4 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -372,19 +372,16 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
}
case Opt_discard:
- {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
/* if set to 1, even copying files will cause
* trimming :O
* -> user has more control over the online trimming
*/
sbi->minblks_trim = 64;
- if (blk_queue_discard(q))
+ if (bdev_max_discard_sectors(sb->s_bdev))
*flag |= JFS_DISCARD;
else
pr_err("JFS: discard option not supported on device\n");
break;
- }
case Opt_nodiscard:
*flag &= ~JFS_DISCARD;
@@ -392,10 +389,9 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_discard_minblk:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
char *minblks_trim = args[0].from;
int rc;
- if (blk_queue_discard(q)) {
+ if (bdev_max_discard_sectors(sb->s_bdev)) {
*flag |= JFS_DISCARD;
rc = kstrtouint(minblks_trim, 0,
&sbi->minblks_trim);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 61a8edc4ba8b..e205fde7163a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1406,7 +1406,12 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_root *root = kernfs_root(kn);
+ struct kernfs_root *root;
+
+ if (!kn)
+ return;
+
+ root = kernfs_root(kn);
down_write(&root->kernfs_rwsem);
__kernfs_remove(kn);
diff --git a/fs/libfs.c b/fs/libfs.c
index e64bdedef168..31b0ddf01c31 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -539,17 +539,17 @@ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
EXPORT_SYMBOL(simple_setattr);
-static int simple_readpage(struct file *file, struct page *page)
+static int simple_read_folio(struct file *file, struct folio *folio)
{
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
int simple_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct page *page;
@@ -557,7 +557,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -592,7 +592,7 @@ EXPORT_SYMBOL(simple_write_begin);
* should extend on what's done here with a call to mark_inode_dirty() in the
* case that i_size has changed.
*
- * Use *ONLY* with simple_readpage()
+ * Use *ONLY* with simple_read_folio()
*/
static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -628,7 +628,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
* Provides ramfs-style behavior: data in the pagecache, but no writeback.
*/
const struct address_space_operations ram_aops = {
- .readpage = simple_readpage,
+ .read_folio = simple_read_folio,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
.dirty_folio = noop_dirty_folio,
diff --git a/fs/locks.c b/fs/locks.c
index 8c6df10cd9ed..ca28e0e50e56 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -300,6 +300,34 @@ void locks_release_private(struct file_lock *fl)
}
EXPORT_SYMBOL_GPL(locks_release_private);
+/**
+ * locks_owner_has_blockers - Check for blocking lock requests
+ * @flctx: file lock context
+ * @owner: lock owner
+ *
+ * Return values:
+ * %true: @owner has at least one blocker
+ * %false: @owner has no blockers
+ */
+bool locks_owner_has_blockers(struct file_lock_context *flctx,
+ fl_owner_t owner)
+{
+ struct file_lock *fl;
+
+ spin_lock(&flctx->flc_lock);
+ list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ if (fl->fl_owner != owner)
+ continue;
+ if (!list_empty(&fl->fl_blocked_requests)) {
+ spin_unlock(&flctx->flc_lock);
+ return true;
+ }
+ }
+ spin_unlock(&flctx->flc_lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(locks_owner_has_blockers);
+
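A hedged sketch of how a lock manager might consume the new helper,
refusing to reap an owner's locks while another request is still
blocked on them (the wrapper function is hypothetical):

	static bool owner_can_be_reaped(struct file_lock_context *flctx,
					fl_owner_t owner)
	{
		/* safe to reap only once nothing waits on this owner */
		return !locks_owner_has_blockers(flctx, owner);
	}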
/* Free a lock which is not in use. */
void locks_free_lock(struct file_lock *fl)
{
@@ -874,6 +902,8 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
struct file_lock *cfl;
struct file_lock_context *ctx;
struct inode *inode = locks_inode(filp);
+ void *owner;
+ void (*func)(void);
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
@@ -881,12 +911,23 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
return;
}
+retry:
spin_lock(&ctx->flc_lock);
list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
- if (posix_locks_conflict(fl, cfl)) {
- locks_copy_conflock(fl, cfl);
- goto out;
+ if (!posix_locks_conflict(fl, cfl))
+ continue;
+ if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
+ && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
+ owner = cfl->fl_lmops->lm_mod_owner;
+ func = cfl->fl_lmops->lm_expire_lock;
+ __module_get(owner);
+ spin_unlock(&ctx->flc_lock);
+ (*func)();
+ module_put(owner);
+ goto retry;
}
+ locks_copy_conflock(fl, cfl);
+ goto out;
}
fl->fl_type = F_UNLCK;
out:
@@ -1060,6 +1101,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
int error;
bool added = false;
LIST_HEAD(dispose);
+ void *owner;
+ void (*func)(void);
ctx = locks_get_lock_context(inode, request->fl_type);
if (!ctx)
@@ -1078,6 +1121,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
+retry:
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
@@ -1089,6 +1133,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
if (!posix_locks_conflict(request, fl))
continue;
+ if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
+ && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
+ owner = fl->fl_lmops->lm_mod_owner;
+ func = fl->fl_lmops->lm_expire_lock;
+ __module_get(owner);
+ spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
+ (*func)();
+ module_put(owner);
+ goto retry;
+ }
if (conflock)
locks_copy_conflock(conflock, fl);
error = -EAGAIN;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f1a6610e4ee6..da8bdd1712a7 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -402,9 +402,9 @@ static int minix_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, minix_get_block, wbc);
}
-static int minix_readpage(struct file *file, struct page *page)
+static int minix_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,minix_get_block);
+ return block_read_full_folio(folio, minix_get_block);
}
int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
@@ -423,13 +423,12 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
}
static int minix_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- minix_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, minix_get_block);
if (unlikely(ret))
minix_write_failed(mapping, pos + len);
@@ -444,7 +443,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations minix_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = minix_readpage,
+ .read_folio = minix_read_folio,
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
diff --git a/fs/mpage.c b/fs/mpage.c
index 1fe56f8c495f..0d25f44f5707 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -36,7 +36,7 @@
*
* The mpage code never puts partial pages into a BIO (except for end-of-file).
* If a page does not map to a contiguous run of blocks then it simply falls
- * back to block_read_full_page().
+ * back to block_read_full_folio().
*
* Why is this? If a page's completion depends on a number of different BIOs
* which can complete in any order (or at the same time) then determining the
@@ -68,7 +68,7 @@ static struct bio *mpage_bio_submit(struct bio *bio)
/*
* support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
- * the page, which allows readpage to avoid triggering a duplicate call
+ * the page, which allows read_folio to avoid triggering a duplicate call
* to get_block.
*
* The idea is to avoid adding buffers to pages that don't already have
@@ -296,7 +296,7 @@ confused:
if (args->bio)
args->bio = mpage_bio_submit(args->bio);
if (!PageUptodate(page))
- block_read_full_page(page, args->get_block);
+ block_read_full_folio(page_folio(page), args->get_block);
else
unlock_page(page);
goto out;
@@ -364,20 +364,22 @@ EXPORT_SYMBOL(mpage_readahead);
/*
* This isn't called much at all
*/
-int mpage_readpage(struct page *page, get_block_t get_block)
+int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
struct mpage_readpage_args args = {
- .page = page,
+ .page = &folio->page,
.nr_pages = 1,
.get_block = get_block,
};
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+
args.bio = do_mpage_readpage(&args);
if (args.bio)
mpage_bio_submit(args.bio);
return 0;
}
-EXPORT_SYMBOL(mpage_readpage);
+EXPORT_SYMBOL(mpage_read_folio);
/*
* Writing is not so simple.
@@ -425,11 +427,11 @@ static void clean_buffers(struct page *page, unsigned first_unmapped)
/*
* we cannot drop the bh if the page is not uptodate or a concurrent
- * readpage would fail to serialize with the bh and it would read from
+ * read_folio would fail to serialize with the bh and it would read from
* disk before we reach the platter.
*/
if (buffer_heads_over_limit && PageUptodate(page))
- try_to_free_buffers(page);
+ try_to_free_buffers(page_folio(page));
}
/*
@@ -510,7 +512,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
/*
* Page has buffers, but they are all unmapped. The page was
* created by pagein or read over a hole which was handled by
- * block_read_full_page(). If this address_space is also
+ * block_read_full_folio(). If this address_space is also
* using mpage_readahead then this can rarely happen.
*/
goto confused;
diff --git a/fs/namei.c b/fs/namei.c
index 5e05571bb941..3dc0db2df561 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
+#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
@@ -5001,28 +5002,28 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
}
EXPORT_SYMBOL(page_readlink);
-/*
- * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
- */
-int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
+int page_symlink(struct inode *inode, const char *symname, int len)
{
struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
struct page *page;
void *fsdata;
int err;
- unsigned int flags = 0;
- if (nofs)
- flags |= AOP_FLAG_NOFS;
+ unsigned int flags;
retry:
- err = pagecache_write_begin(NULL, mapping, 0, len-1,
- flags, &page, &fsdata);
+ if (nofs)
+ flags = memalloc_nofs_save();
+ err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
+ if (nofs)
+ memalloc_nofs_restore(flags);
if (err)
goto fail;
memcpy(page_address(page), symname, len-1);
- err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+ err = aops->write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
if (err < 0)
goto fail;
@@ -5034,13 +5035,6 @@ retry:
fail:
return err;
}
-EXPORT_SYMBOL(__page_symlink);
-
-int page_symlink(struct inode *inode, const char *symname, int len)
-{
- return __page_symlink(inode, symname, len,
- !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
-}
EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
diff --git a/fs/namespace.c b/fs/namespace.c
index afe2b64b14f1..41461f55c039 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4026,8 +4026,9 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
const struct mount *mnt)
{
- return !(kattr->attr_set & MNT_READONLY) ||
- (mnt->mnt.mnt_flags & MNT_READONLY);
+ return (!(kattr->attr_set & MNT_READONLY) ||
+ (mnt->mnt.mnt_flags & MNT_READONLY)) &&
+ !kattr->mnt_userns;
}
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 281a88a5b8dc..8742d22dfd2b 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -198,22 +198,21 @@ cleanup_free:
EXPORT_SYMBOL(netfs_readahead);
/**
- * netfs_readpage - Helper to manage a readpage request
+ * netfs_read_folio - Helper to manage a read_folio request
* @file: The file to read from
- * @subpage: A subpage of the folio to read
+ * @folio: The folio to read
*
- * Fulfil a readpage request by drawing data from the cache if possible, or the
- * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests
- * from different sources will get munged together.
+ * Fulfil a read_folio request by drawing data from the cache if
+ * possible, or the netfs if not. Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled.
*/
-int netfs_readpage(struct file *file, struct page *subpage)
+int netfs_read_folio(struct file *file, struct folio *folio)
{
- struct folio *folio = page_folio(subpage);
struct address_space *mapping = folio_file_mapping(folio);
struct netfs_io_request *rreq;
struct netfs_i_context *ctx = netfs_i_context(mapping->host);
@@ -245,7 +244,7 @@ alloc_error:
folio_unlock(folio);
return ret;
}
-EXPORT_SYMBOL(netfs_readpage);
+EXPORT_SYMBOL(netfs_read_folio);
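Since the helper now matches the read_folio signature, a network
filesystem can wire it straight into its address_space_operations; a
sketch with a placeholder "myfs" name:

	const struct address_space_operations myfs_aops = {
		.read_folio	= netfs_read_folio,
		.readahead	= netfs_readahead,
		/* remaining ops elided */
	};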
/*
* Prepare a folio for writing without reading first
@@ -302,7 +301,6 @@ zero_out:
* @mapping: The mapping to read from
* @pos: File position at which the write will begin
* @len: The length of the write (may extend beyond the end of the folio chosen)
- * @aop_flags: AOP_* flags
* @_folio: Where to put the resultant folio
* @_fsdata: Place for the netfs to store a cookie
*
@@ -329,22 +327,19 @@ zero_out:
* This is usable whether or not caching is enabled.
*/
int netfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int aop_flags,
- struct folio **_folio, void **_fsdata)
+ loff_t pos, unsigned int len, struct folio **_folio,
+ void **_fsdata)
{
struct netfs_io_request *rreq;
struct netfs_i_context *ctx = netfs_i_context(file_inode(file));
struct folio *folio;
- unsigned int fgp_flags;
+ unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
retry:
- fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (aop_flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (!folio)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c6b263b5faf1..a8ecdd527662 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -55,7 +55,7 @@ static int nfs_closedir(struct inode *, struct file *);
static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static void nfs_readdir_clear_array(struct page*);
+static void nfs_readdir_free_folio(struct folio *);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -67,7 +67,7 @@ const struct file_operations nfs_dir_operations = {
};
const struct address_space_operations nfs_dir_aops = {
- .freepage = nfs_readdir_clear_array,
+ .free_folio = nfs_readdir_free_folio,
};
#define NFS_INIT_DTSIZE PAGE_SIZE
@@ -228,6 +228,11 @@ static void nfs_readdir_clear_array(struct page *page)
kunmap_atomic(array);
}
+static void nfs_readdir_free_folio(struct folio *folio)
+{
+ nfs_readdir_clear_array(&folio->page);
+}
+
static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie,
u64 change_attr)
{
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 11c566d8769f..4eb2a8380a28 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -153,28 +153,25 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
}
/**
- * nfs_direct_IO - NFS address space operation for direct I/O
+ * nfs_swap_rw - NFS address space operation for swap I/O
* @iocb: target I/O control block
* @iter: I/O buffer
*
- * The presence of this routine in the address space ops vector means
- * the NFS client supports direct I/O. However, for most direct IO, we
- * shunt off direct read and write requests before the VFS gets them,
- * so this method is only ever called for swap.
+ * Perform IO to the swap-file. This is much like direct IO.
*/
-ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
-
- /* we only support swap file calling nfs_direct_IO */
- if (!IS_SWAPFILE(inode))
- return 0;
+ ssize_t ret;
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter, true);
- return nfs_file_direct_write(iocb, iter, true);
+ ret = nfs_file_direct_read(iocb, iter, true);
+ else
+ ret = nfs_file_direct_write(iocb, iter, true);
+ if (ret < 0)
+ return ret;
+ return 0;
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 150b7fa8f0a7..6f5425e89ca6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp)
return res;
res = nfs_open(inode, filp);
+ if (res == 0)
+ filp->f_mode |= FMODE_CAN_ODIRECT;
return res;
}
@@ -313,7 +315,7 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page,
* increment the page use counts until he is done with the page.
*/
static int nfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
@@ -325,7 +327,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file, mapping->host->i_ino, len, (long long) pos);
start:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -337,7 +339,7 @@ start:
} else if (!once_thru &&
nfs_want_read_modify_write(file, page, pos, len)) {
once_thru = 1;
- ret = nfs_readpage(file, page);
+ ret = nfs_read_folio(file, page_folio(page));
put_page(page);
if (!ret)
goto start;
@@ -415,34 +417,31 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
}
/*
- * Attempt to release the private state associated with a page
- * - Called if either PG_private or PG_fscache is set on the page
- * - Caller holds page lock
- * - Return true (may release page) or false (may not)
+ * Attempt to release the private state associated with a folio
+ * - Called if either private or fscache flags are set on the folio
+ * - Caller holds folio lock
+ * - Return true (may release folio) or false (may not)
*/
-static int nfs_release_page(struct page *page, gfp_t gfp)
+static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
{
- dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+ dfprintk(PAGECACHE, "NFS: release_folio(%p)\n", folio);
- /* If PagePrivate() is set, then the page is not freeable */
- if (PagePrivate(page))
- return 0;
- return nfs_fscache_release_page(page, gfp);
+ /* If the private flag is set, then the folio is not freeable */
+ if (folio_test_private(folio))
+ return false;
+ return nfs_fscache_release_folio(folio, gfp);
}
-static void nfs_check_dirty_writeback(struct page *page,
+static void nfs_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct nfs_inode *nfsi;
- struct address_space *mapping = page_file_mapping(page);
-
- if (!mapping || PageSwapCache(page))
- return;
+ struct address_space *mapping = folio->mapping;
/*
- * Check if an unstable page is currently being committed and
- * if so, have the VM treat it as if the page is under writeback
- * so it will not block due to pages that will shortly be freeable.
+ * Check if an unstable folio is currently being committed and
+ * if so, have the VM treat it as if the folio is under writeback
+ * so it will not block due to folios that will shortly be freeable.
*/
nfsi = NFS_I(mapping->host);
if (atomic_read(&nfsi->commit_info.rpcs_out)) {
@@ -451,11 +450,11 @@ static void nfs_check_dirty_writeback(struct page *page,
}
/*
- * If PagePrivate() is set, then the page is not freeable and as the
- * inode is not being committed, it's not going to be cleaned in the
- * near future so treat it as dirty
+ * If the private flag is set, then the folio is not freeable
+ * and as the inode is not being committed, it's not going to
+ * be cleaned in the near future so treat it as dirty
*/
- if (PagePrivate(page))
+ if (folio_test_private(folio))
*dirty = true;
}
@@ -483,6 +482,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
{
unsigned long blocks;
long long isize;
+ int ret;
struct inode *inode = file_inode(file);
struct rpc_clnt *clnt = NFS_CLIENT(inode);
struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
@@ -496,13 +496,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
return -EINVAL;
}
- *span = sis->pages;
+ ret = rpc_clnt_swap_activate(clnt);
+ if (ret)
+ return ret;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ if (ret < 0) {
+ rpc_clnt_swap_deactivate(clnt);
+ return ret;
+ }
+ *span = sis->pages;
if (cl->rpc_ops->enable_swap)
cl->rpc_ops->enable_swap(inode);
- return rpc_clnt_swap_activate(clnt);
+ sis->flags |= SWP_FS_OPS;
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
@@ -517,7 +526,7 @@ static void nfs_swap_deactivate(struct file *file)
}
const struct address_space_operations nfs_file_aops = {
- .readpage = nfs_readpage,
+ .read_folio = nfs_read_folio,
.readahead = nfs_readahead,
.dirty_folio = filemap_dirty_folio,
.writepage = nfs_writepage,
@@ -525,8 +534,7 @@ const struct address_space_operations nfs_file_aops = {
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
.invalidate_folio = nfs_invalidate_folio,
- .releasepage = nfs_release_page,
- .direct_IO = nfs_direct_IO,
+ .release_folio = nfs_release_folio,
#ifdef CONFIG_MIGRATION
.migratepage = nfs_migrate_page,
#endif
@@ -535,6 +543,7 @@ const struct address_space_operations nfs_file_aops = {
.error_remove_page = generic_error_remove_page,
.swap_activate = nfs_swap_activate,
.swap_deactivate = nfs_swap_deactivate,
+ .swap_rw = nfs_swap_rw,
};
/*
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e2d59bb5e6bb..9a16897e8dc6 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
if (result.negated)
ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
else
- ctx->flags &= NFS_MOUNT_SOFTREVAL;
+ ctx->flags |= NFS_MOUNT_SOFTREVAL;
break;
case Opt_posix:
if (result.negated)
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 4e980cc04779..2a37af880978 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -48,14 +48,14 @@ extern void nfs_fscache_release_file(struct inode *, struct file *);
extern int __nfs_fscache_read_page(struct inode *, struct page *);
extern void __nfs_fscache_write_page(struct inode *, struct page *);
-static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
{
- if (PageFsCache(page)) {
+ if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- wait_on_page_fscache(page);
- fscache_note_page_release(nfs_i_fscache(page->mapping->host));
- nfs_inc_fscache_stats(page->mapping->host,
+ folio_wait_fscache(folio);
+ fscache_note_page_release(nfs_i_fscache(folio->mapping->host));
+ nfs_inc_fscache_stats(folio->mapping->host,
NFSIOS_FSCACHE_PAGES_UNCACHED);
}
return true;
@@ -129,9 +129,9 @@ static inline void nfs_fscache_open_file(struct inode *inode,
struct file *filp) {}
static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
-static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
{
- return 1; /* True: may release page */
+ return true; /* may release folio */
}
static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
{
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 16106f805ffa..a79f66432bd3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -363,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+ if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) {
+ fattr->pre_change_attr = version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+}
+
static void nfs4_test_and_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
const struct cred *cred)
@@ -6553,7 +6561,9 @@ static void nfs4_delegreturn_release(void *calldata)
pnfs_roc_release(&data->lr.arg, &data->lr.res,
data->res.lr_ret);
if (inode) {
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ nfs4_fattr_set_prechange(&data->fattr,
+ inode_peek_iversion_raw(inode));
+ nfs_refresh_inode(inode, &data->fattr);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5e7657374bc3..5a9b043662e9 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -333,8 +333,9 @@ out:
* - The error flag is set for this page. This happens only when a
* previous async read operation failed.
*/
-int nfs_readpage(struct file *file, struct page *page)
+int nfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct nfs_readdesc desc;
struct inode *inode = page_file_mapping(page)->host;
int ret;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 25ba299fdac2..0e27a2e4e68b 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -26,21 +26,21 @@
* and straight-forward than readdir caching.
*/
-static int nfs_symlink_filler(void *data, struct page *page)
+static int nfs_symlink_filler(struct file *file, struct folio *folio)
{
- struct inode *inode = data;
+ struct inode *inode = folio->mapping->host;
int error;
- error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE);
+ error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE);
if (error < 0)
goto error;
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
error:
- SetPageError(page);
- unlock_page(page);
+ folio_set_error(folio);
+ folio_unlock(folio);
return -EIO;
}
@@ -67,7 +67,7 @@ static const char *nfs_get_link(struct dentry *dentry,
if (err)
return err;
page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
- inode);
+ NULL);
if (IS_ERR(page))
return ERR_CAST(page);
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 2c1b027774d4..f172412447f5 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -119,14 +119,14 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
struct inode *inode = nf->nf_inode;
do {
- mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_group_lock(nfsd_file_fsnotify_group);
mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
- nfsd_file_fsnotify_group);
+ nfsd_file_fsnotify_group);
if (mark) {
nfm = nfsd_file_mark_get(container_of(mark,
struct nfsd_file_mark,
nfm_mark));
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
if (nfm) {
fsnotify_put_mark(mark);
break;
@@ -134,8 +134,9 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
/* Avoid soft lockup race with nfsd_file_mark_put() */
fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group);
fsnotify_put_mark(mark);
- } else
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ } else {
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
+ }
/* allocate a new nfm */
new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
@@ -302,6 +303,8 @@ nfsd_file_put_noref(struct nfsd_file *nf)
void
nfsd_file_put(struct nfsd_file *nf)
{
+ might_sleep();
+
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
nfsd_file_flush(nf);
@@ -678,7 +681,8 @@ nfsd_file_cache_init(void)
goto out_shrinker;
}
- nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+ nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops,
+ FSNOTIFY_GROUP_NOFS);
if (IS_ERR(nfsd_file_fsnotify_group)) {
pr_err("nfsd: unable to create fsnotify group: %ld\n",
PTR_ERR(nfsd_file_fsnotify_group));
@@ -897,9 +901,9 @@ nfsd_file_is_cached(struct inode *inode)
return ret;
}
-__be32
-nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf)
+static __be32
+nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf, bool open)
{
__be32 status;
struct net *net = SVC_NET(rqstp);
@@ -994,10 +998,14 @@ open_file:
nfsd_file_gc();
nf->nf_mark = nfsd_file_mark_find_or_create(nf);
- if (nf->nf_mark)
- status = nfsd_open_verified(rqstp, fhp, S_IFREG,
- may_flags, &nf->nf_file);
- else
+ if (nf->nf_mark) {
+ if (open) {
+ status = nfsd_open_verified(rqstp, fhp, may_flags,
+ &nf->nf_file);
+ trace_nfsd_file_open(nf, status);
+ } else
+ status = nfs_ok;
+ } else
status = nfserr_jukebox;
/*
* If construction failed, or we raced with a call to unlink()
@@ -1017,6 +1025,40 @@ open_file:
goto out;
}
+/**
+ * nfsd_file_acquire - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true);
+}
+
+/**
+ * nfsd_file_create - Get a struct nfsd_file, do not open
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file just created
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false);
+}
+
/*
* Note that fields may be added, removed or reordered in the future. Programs
* scraping this file for info should test the labels to ensure they're
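
Note: nfsd_file_acquire() and nfsd_file_create() are now thin wrappers
around nfsd_do_file_acquire(); the only behavioral difference is the "open"
flag. The acquire path opens via nfsd_open_verified(), while the create
path leaves nf->nf_file empty so the NFSv4 open-create caller can install
the struct file it already obtained from dentry_create() (see the
nfs4_get_vfs_file() hunk further down). The shared-worker pattern in toy
form (stand-in types and bodies, not the real cache logic):

	#include <stdbool.h>

	struct handle { int fd; };

	static int do_acquire(struct handle *h, bool open)
	{
		/* common hash lookup / allocation would happen here */
		h->fd = open ? 3 /* stand-in for nfsd_open_verified() */ : -1;
		return 0;
	}

	static int file_acquire(struct handle *h) { return do_acquire(h, true); }
	static int file_create(struct handle *h)  { return do_acquire(h, false); }
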
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 435ceab27897..1da0c79a5580 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -59,5 +59,7 @@ void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
+__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
int nfsd_file_cache_stats_open(struct inode *, struct file *);
#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 936eebd4c56d..981a3a7a6e16 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/ext2_fs.h>
#include <linux/magic.h>
+#include <linux/namei.h>
#include "cache.h"
#include "xdr3.h"
@@ -220,17 +221,132 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
}
/*
- * With NFSv3, CREATE processing is a lot easier than with NFSv2.
- * At least in theory; we'll see how it fares in practice when the
- * first reports about SunOS compatibility problems start to pour in...
+ * Implement NFSv3's unchecked, guarded, and exclusive CREATE
+ * semantics for regular files. Except for the created file,
+ * this operation is stateless on the server.
+ *
+ * Upon return, caller must release @fhp and @resfhp.
*/
static __be32
+nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd3_createargs *argp)
+{
+ struct iattr *iap = &argp->attrs;
+ struct dentry *parent, *child;
+ __u32 v_mtime, v_atime;
+ struct inode *inode;
+ __be32 status;
+ int host_err;
+
+ if (isdotent(argp->name, argp->len))
+ return nfserr_exist;
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (status != nfs_ok)
+ return status;
+
+ parent = fhp->fh_dentry;
+ inode = d_inode(parent);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+
+ child = lookup_one_len(argp->name, parent, argp->len);
+ if (IS_ERR(child)) {
+ status = nfserrno(PTR_ERR(child));
+ goto out;
+ }
+
+ if (d_really_is_negative(child)) {
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (status != nfs_ok)
+ goto out;
+ }
+
+ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
+ if (status != nfs_ok)
+ goto out;
+
+ v_mtime = 0;
+ v_atime = 0;
+ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
+ u32 *verifier = (u32 *)argp->verf;
+
+ /*
+ * Solaris 7 gets confused (bugid 4218508) if these have
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits.
+ */
+ v_mtime = verifier[0] & 0x7fffffff;
+ v_atime = verifier[1] & 0x7fffffff;
+ }
+
+ if (d_really_is_positive(child)) {
+ status = nfs_ok;
+
+ switch (argp->createmode) {
+ case NFS3_CREATE_UNCHECKED:
+ if (!d_is_reg(child))
+ break;
+ iap->ia_valid &= ATTR_SIZE;
+ goto set_attr;
+ case NFS3_CREATE_GUARDED:
+ status = nfserr_exist;
+ break;
+ case NFS3_CREATE_EXCLUSIVE:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ break;
+ }
+ status = nfserr_exist;
+ }
+ goto out;
+ }
+
+ if (!IS_POSIXACL(inode))
+ iap->ia_mode &= ~current_umask();
+
+ host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true);
+ if (host_err < 0) {
+ status = nfserrno(host_err);
+ goto out;
+ }
+
+ /* A newly created file already has a file size of zero. */
+ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+ iap->ia_valid &= ~ATTR_SIZE;
+ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
+ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
+ ATTR_MTIME_SET | ATTR_ATIME_SET;
+ iap->ia_mtime.tv_sec = v_mtime;
+ iap->ia_atime.tv_sec = v_atime;
+ iap->ia_mtime.tv_nsec = 0;
+ iap->ia_atime.tv_nsec = 0;
+ }
+
+set_attr:
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
+
+out:
+ fh_unlock(fhp);
+ if (child && !IS_ERR(child))
+ dput(child);
+ fh_drop_write(fhp);
+ return status;
+}
+
+static __be32
nfsd3_proc_create(struct svc_rqst *rqstp)
{
struct nfsd3_createargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- svc_fh *dirfhp, *newfhp = NULL;
- struct iattr *attr;
+ svc_fh *dirfhp, *newfhp;
dprintk("nfsd: CREATE(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -239,21 +355,8 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
dirfhp = fh_copy(&resp->dirfh, &argp->fh);
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
- attr = &argp->attrs;
-
- /* Unfudge the mode bits */
- attr->ia_mode &= ~S_IFMT;
- if (!(attr->ia_valid & ATTR_MODE)) {
- attr->ia_valid |= ATTR_MODE;
- attr->ia_mode = S_IFREG;
- } else {
- attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG;
- }
- /* Now create the file and set attributes */
- resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
- attr, newfhp, argp->createmode,
- (u32 *)argp->verf, NULL, NULL);
+ resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp);
return rpc_success;
}
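
Note: for EXCLUSIVE create, the 8-byte client verifier is crammed into the
new file's atime/mtime seconds fields, which is what makes a retransmitted
CREATE idempotent: matching timestamps on an existing zero-length file mean
"same creator, report success". A self-contained model of the masking and
comparison (this mirrors the patch's arithmetic only, not any on-disk
representation):

	#include <stdbool.h>
	#include <stdint.h>

	struct stamp { uint32_t atime, mtime; };

	/* High bit cleared: Solaris 7 (bugid 4218508) and non-bigtime XFS
	 * mishandle timestamps with it set. */
	static void stash_verifier(struct stamp *st, const uint32_t verf[2])
	{
		st->mtime = verf[0] & 0x7fffffff;
		st->atime = verf[1] & 0x7fffffff;
	}

	static bool same_creator(const struct stamp *st, const uint32_t verf[2])
	{
		return st->mtime == (verf[0] & 0x7fffffff) &&
		       st->atime == (verf[1] & 0x7fffffff);
	}
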
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b207c76a873f..3895eb52d2b1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -37,6 +37,8 @@
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/kthread.h>
+#include <linux/namei.h>
+
#include <linux/sunrpc/addr.h>
#include <linux/nfs_ssc.h>
@@ -235,6 +237,183 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate
&resfh->fh_handle);
}
+static inline bool nfsd4_create_is_exclusive(int createmode)
+{
+ return createmode == NFS4_CREATE_EXCLUSIVE ||
+ createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
+static __be32
+nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child,
+ struct nfsd4_open *open)
+{
+ struct file *filp;
+ struct path path;
+ int oflags;
+
+ oflags = O_CREAT | O_LARGEFILE;
+ switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) {
+ case NFS4_SHARE_ACCESS_WRITE:
+ oflags |= O_WRONLY;
+ break;
+ case NFS4_SHARE_ACCESS_BOTH:
+ oflags |= O_RDWR;
+ break;
+ default:
+ oflags |= O_RDONLY;
+ }
+
+ path.mnt = fhp->fh_export->ex_path.mnt;
+ path.dentry = child;
+ filp = dentry_create(&path, oflags, open->op_iattr.ia_mode,
+ current_cred());
+ if (IS_ERR(filp))
+ return nfserrno(PTR_ERR(filp));
+
+ open->op_filp = filp;
+ return nfs_ok;
+}
+
+/*
+ * Implement NFSv4's unchecked, guarded, and exclusive create
+ * semantics for regular files. Open state for this new file is
+ * subsequently fabricated in nfsd4_process_open2().
+ *
+ * Upon return, caller must release @fhp and @resfhp.
+ */
+static __be32
+nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd4_open *open)
+{
+ struct iattr *iap = &open->op_iattr;
+ struct dentry *parent, *child;
+ __u32 v_mtime, v_atime;
+ struct inode *inode;
+ __be32 status;
+ int host_err;
+
+ if (isdotent(open->op_fname, open->op_fnamelen))
+ return nfserr_exist;
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (status != nfs_ok)
+ return status;
+ parent = fhp->fh_dentry;
+ inode = d_inode(parent);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+
+ child = lookup_one_len(open->op_fname, parent, open->op_fnamelen);
+ if (IS_ERR(child)) {
+ status = nfserrno(PTR_ERR(child));
+ goto out;
+ }
+
+ if (d_really_is_negative(child)) {
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (status != nfs_ok)
+ goto out;
+ }
+
+ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
+ if (status != nfs_ok)
+ goto out;
+
+ v_mtime = 0;
+ v_atime = 0;
+ if (nfsd4_create_is_exclusive(open->op_createmode)) {
+ u32 *verifier = (u32 *)open->op_verf.data;
+
+ /*
+ * Solaris 7 gets confused (bugid 4218508) if these have
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits. If this
+ * is ever changed to use different attrs for storing the
+ * verifier, then do_open_lookup() will also need to be
+ * fixed accordingly.
+ */
+ v_mtime = verifier[0] & 0x7fffffff;
+ v_atime = verifier[1] & 0x7fffffff;
+ }
+
+ if (d_really_is_positive(child)) {
+ status = nfs_ok;
+
+ switch (open->op_createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ if (!d_is_reg(child))
+ break;
+
+ /*
+ * In NFSv4, we don't want to truncate the file
+ * now. This would be wrong if the OPEN fails for
+ * some other reason. Furthermore, if the size is
+ * nonzero, we should ignore it according to spec!
+ */
+ open->op_truncate = (iap->ia_valid & ATTR_SIZE) &&
+ !iap->ia_size;
+ break;
+ case NFS4_CREATE_GUARDED:
+ status = nfserr_exist;
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ open->op_created = true;
+ break; /* subtle */
+ }
+ status = nfserr_exist;
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ open->op_created = true;
+ goto set_attr; /* subtle */
+ }
+ status = nfserr_exist;
+ }
+ goto out;
+ }
+
+ if (!IS_POSIXACL(inode))
+ iap->ia_mode &= ~current_umask();
+
+ status = nfsd4_vfs_create(fhp, child, open);
+ if (status != nfs_ok)
+ goto out;
+ open->op_created = true;
+
+ /* A newly created file already has a file size of zero. */
+ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+ iap->ia_valid &= ~ATTR_SIZE;
+ if (nfsd4_create_is_exclusive(open->op_createmode)) {
+ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
+ ATTR_MTIME_SET|ATTR_ATIME_SET;
+ iap->ia_mtime.tv_sec = v_mtime;
+ iap->ia_atime.tv_sec = v_atime;
+ iap->ia_mtime.tv_nsec = 0;
+ iap->ia_atime.tv_nsec = 0;
+ }
+
+set_attr:
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
+
+out:
+ fh_unlock(fhp);
+ if (child && !IS_ERR(child))
+ dput(child);
+ fh_drop_write(fhp);
+ return status;
+}
+
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
{
@@ -264,16 +443,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* yes | yes | GUARDED4 | GUARDED4
*/
- /*
- * Note: create modes (UNCHECKED,GUARDED...) are the same
- * in NFSv4 as in v3 except EXCLUSIVE4_1.
- */
current->fs->umask = open->op_umask;
- status = do_nfsd_create(rqstp, current_fh, open->op_fname,
- open->op_fnamelen, &open->op_iattr,
- *resfh, open->op_createmode,
- (u32 *)open->op_verf.data,
- &open->op_truncate, &open->op_created);
+ status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
current->fs->umask = 0;
if (!status && open->op_label.len)
@@ -284,7 +455,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* use the returned bitmask to indicate which attributes
* we used to store the verifier:
*/
- if (nfsd_create_is_exclusive(open->op_createmode) && status == 0)
+ if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
FATTR4_WORD1_TIME_MODIFY);
} else
@@ -375,6 +546,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
(int)open->op_fnamelen, open->op_fname,
open->op_openowner);
+ open->op_filp = NULL;
+
/* This check required by spec. */
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
@@ -427,43 +600,35 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
switch (open->op_claim_type) {
- case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- case NFS4_OPEN_CLAIM_NULL:
- status = do_open_lookup(rqstp, cstate, open, &resfh);
- if (status)
- goto out;
- break;
- case NFS4_OPEN_CLAIM_PREVIOUS:
- status = nfs4_check_open_reclaim(cstate->clp);
- if (status)
- goto out;
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
- reclaim = true;
- fallthrough;
- case NFS4_OPEN_CLAIM_FH:
- case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
- status = do_open_fhandle(rqstp, cstate, open);
- if (status)
- goto out;
- resfh = &cstate->current_fh;
- break;
- case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
- case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- dprintk("NFSD: unsupported OPEN claim type %d\n",
- open->op_claim_type);
- status = nfserr_notsupp;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_NULL:
+ status = do_open_lookup(rqstp, cstate, open, &resfh);
+ if (status)
goto out;
- default:
- dprintk("NFSD: Invalid OPEN claim type %d\n",
- open->op_claim_type);
- status = nfserr_inval;
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ status = nfs4_check_open_reclaim(cstate->clp);
+ if (status)
+ goto out;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ reclaim = true;
+ fallthrough;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ status = do_open_fhandle(rqstp, cstate, open);
+ if (status)
goto out;
+ resfh = &cstate->current_fh;
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ status = nfserr_notsupp;
+ goto out;
+ default:
+ status = nfserr_inval;
+ goto out;
}
- /*
- * nfsd4_process_open2() does the actual opening of the file. If
- * successful, it (1) truncates the file if open->op_truncate was
- * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
- */
+
status = nfsd4_process_open2(rqstp, resfh, open);
WARN(status && open->op_created,
"nfsd4_process_open2 failed to open newly-created file! status=%u\n",
@@ -471,6 +636,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (reclaim && !status)
nn->somebody_reclaimed = true;
out:
+ if (open->op_filp) {
+ fput(open->op_filp);
+ open->op_filp = NULL;
+ }
if (resfh && resfh != &cstate->current_fh) {
fh_dup2(&cstate->current_fh, resfh);
fh_put(resfh);
@@ -801,7 +970,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* the client wants us to do more in this compound:
*/
if (!nfsd4_last_compound_op(rqstp))
- clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
@@ -2481,11 +2650,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
cstate->minorversion = args->minorversion;
fh_init(current_fh, NFS4_FHSIZE);
fh_init(save_fh, NFS4_FHSIZE);
+
/*
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
*/
- clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
/*
* According to RFC3010, this takes precedence over all other errors.
@@ -2600,7 +2770,7 @@ encode_op:
out:
cstate->status = status;
/* Reset deferral mechanism for RPC deferrals */
- set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
return rpc_success;
}
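
Note: the RQ_SPLICE_OK and RQ_USEDEFERRAL flips switch from set_bit()/
clear_bit() to the non-atomic __set_bit()/__clear_bit() variants. That is
safe here because rq_flags is modified only by the thread that owns the
svc_rqst at these points, so the locked read-modify-write buys nothing. A
userspace contrast of the two flavours (the kernel's set_bit() is a LOCK'd
RMW; __set_bit() is a plain one):

	#include <stdatomic.h>

	static void set_bit_atomic(_Atomic unsigned long *word, int nr)
	{
		atomic_fetch_or_explicit(word, 1UL << nr, memory_order_relaxed);
	}

	static void set_bit_plain(unsigned long *word, int nr)
	{
		*word |= 1UL << nr;	/* correct only with a single writer */
	}
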
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 234e852fcdfa..9409a0dc1b76 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -125,6 +125,23 @@ static void free_session(struct nfsd4_session *);
static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+static struct workqueue_struct *laundry_wq;
+
+int nfsd4_create_laundry_wq(void)
+{
+ int rc = 0;
+
+ laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
+ if (laundry_wq == NULL)
+ rc = -ENOMEM;
+ return rc;
+}
+
+void nfsd4_destroy_laundry_wq(void)
+{
+ destroy_workqueue(laundry_wq);
+}
+
static bool is_session_dead(struct nfsd4_session *ses)
{
return ses->se_flags & NFS4_SESSION_DEAD;
@@ -152,6 +169,7 @@ static __be32 get_client_locked(struct nfs4_client *clp)
if (is_client_expired(clp))
return nfserr_expired;
atomic_inc(&clp->cl_rpc_users);
+ clp->cl_state = NFSD4_ACTIVE;
return nfs_ok;
}
@@ -172,6 +190,7 @@ renew_client_locked(struct nfs4_client *clp)
list_move_tail(&clp->cl_lru, &nn->client_lru);
clp->cl_time = ktime_get_boottime_seconds();
+ clp->cl_state = NFSD4_ACTIVE;
}
static void put_client_renew_locked(struct nfs4_client *clp)
@@ -690,6 +709,57 @@ static unsigned int file_hashval(struct svc_fh *fh)
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+/*
+ * Check if courtesy clients have conflicting access and resolve it if possible
+ *
+ * access: op_share_access if share_access is true; check whether this
+ * access mode would conflict with the current deny mode of the
+ * file 'fp'.
+ * access: op_share_deny if share_access is false; check whether this
+ * deny mode would conflict with the current access modes of the
+ * file 'fp'.
+ * stp: skip checking this entry.
+ * new_stp: normal open, not open upgrade.
+ *
+ * Function returns:
+ * false - access/deny mode conflict with normal client.
+ * true - no conflict or conflict with courtesy client(s) is resolved.
+ */
+static bool
+nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp,
+ struct nfs4_ol_stateid *stp, u32 access, bool share_access)
+{
+ struct nfs4_ol_stateid *st;
+ bool resolvable = true;
+ unsigned char bmap;
+ struct nfsd_net *nn;
+ struct nfs4_client *clp;
+
+ lockdep_assert_held(&fp->fi_lock);
+ list_for_each_entry(st, &fp->fi_stateids, st_perfile) {
+ /* ignore lock stateid */
+ if (st->st_openstp)
+ continue;
+ if (st == stp && new_stp)
+ continue;
+ /* check file access against deny mode or vice versa */
+ bmap = share_access ? st->st_deny_bmap : st->st_access_bmap;
+ if (!(access & bmap_to_share_mode(bmap)))
+ continue;
+ clp = st->st_stid.sc_client;
+ if (try_to_expire_client(clp))
+ continue;
+ resolvable = false;
+ break;
+ }
+ if (resolvable) {
+ clp = stp->st_stid.sc_client;
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ }
+ return resolvable;
+}
+
static void
__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
{
@@ -1090,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
get_clnt_odstate(odstate);
dp->dl_type = NFS4_OPEN_DELEGATE_READ;
dp->dl_retries = 1;
+ dp->dl_recalled = false;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
&nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
get_nfs4_file(fp);
@@ -2004,6 +2075,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
idr_init(&clp->cl_stateids);
atomic_set(&clp->cl_rpc_users, 0);
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ clp->cl_state = NFSD4_ACTIVE;
+ atomic_set(&clp->cl_delegs_in_recall, 0);
INIT_LIST_HEAD(&clp->cl_idhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
@@ -2408,10 +2481,17 @@ static int client_info_show(struct seq_file *m, void *v)
memcpy(&clid, &clp->cl_clientid, sizeof(clid));
seq_printf(m, "clientid: 0x%llx\n", clid);
seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr);
- if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+
+ if (clp->cl_state == NFSD4_COURTESY)
+ seq_puts(m, "status: courtesy\n");
+ else if (clp->cl_state == NFSD4_EXPIRABLE)
+ seq_puts(m, "status: expirable\n");
+ else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
seq_puts(m, "status: confirmed\n");
else
seq_puts(m, "status: unconfirmed\n");
+ seq_printf(m, "seconds from last renew: %lld\n",
+ ktime_get_boottime_seconds() - clp->cl_time);
seq_printf(m, "name: ");
seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
@@ -4694,9 +4774,18 @@ nfsd_break_deleg_cb(struct file_lock *fl)
bool ret = false;
struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+ struct nfsd_net *nn;
trace_nfsd_cb_recall(&dp->dl_stid);
+ dp->dl_recalled = true;
+ atomic_inc(&clp->cl_delegs_in_recall);
+ if (try_to_expire_client(clp)) {
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ }
+
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a delegation isn't returned
@@ -4739,9 +4828,14 @@ static int
nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
struct list_head *dispose)
{
- if (arg & F_UNLCK)
+ struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+ if (arg & F_UNLCK) {
+ if (dp->dl_recalled)
+ atomic_dec(&clp->cl_delegs_in_recall);
return lease_modify(onlist, arg, dispose);
- else
+ } else
return -EAGAIN;
}
@@ -4947,7 +5041,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
- struct nfsd4_open *open)
+ struct nfsd4_open *open, bool new_stp)
{
struct nfsd_file *nf = NULL;
__be32 status;
@@ -4963,6 +5057,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
*/
status = nfs4_file_check_deny(fp, open->op_share_deny);
if (status != nfs_ok) {
+ if (status != nfserr_share_denied) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
+ stp, open->op_share_deny, false))
+ status = nfserr_jukebox;
spin_unlock(&fp->fi_lock);
goto out;
}
@@ -4970,6 +5071,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
/* set access to the file */
status = nfs4_file_get_access(fp, open->op_share_access);
if (status != nfs_ok) {
+ if (status != nfserr_share_denied) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
+ stp, open->op_share_access, true))
+ status = nfserr_jukebox;
spin_unlock(&fp->fi_lock);
goto out;
}
@@ -4985,9 +5093,19 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
if (!fp->fi_fds[oflag]) {
spin_unlock(&fp->fi_lock);
- status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
- if (status)
- goto out_put_access;
+
+ if (!open->op_filp) {
+ status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
+ } else {
+ status = nfsd_file_create(rqstp, cur_fh, access, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
+ nf->nf_file = open->op_filp;
+ open->op_filp = NULL;
+ }
+
spin_lock(&fp->fi_lock);
if (!fp->fi_fds[oflag]) {
fp->fi_fds[oflag] = nf;
@@ -5016,21 +5134,29 @@ out_put_access:
}
static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
+ struct nfsd4_open *open)
{
__be32 status;
unsigned char old_deny_bmap = stp->st_deny_bmap;
if (!test_access(open->op_share_access, stp))
- return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
+ return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false);
/* test and set deny mode */
spin_lock(&fp->fi_lock);
status = nfs4_file_check_deny(fp, open->op_share_deny);
if (status == nfs_ok) {
- set_deny(open->op_share_deny, stp);
- fp->fi_share_deny |=
+ if (status != nfserr_share_denied) {
+ set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |=
(open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+ } else {
+ if (nfs4_resolve_deny_conflicts_locked(fp, false,
+ stp, open->op_share_deny, false))
+ status = nfserr_jukebox;
+ }
}
spin_unlock(&fp->fi_lock);
@@ -5322,6 +5448,18 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
*/
}
+/**
+ * nfsd4_process_open2 - finish open processing
+ * @rqstp: the RPC transaction being executed
+ * @current_fh: NFSv4 COMPOUND's current filehandle
+ * @open: OPEN arguments
+ *
+ * If successful, (1) truncate the file if open->op_truncate was
+ * set, (2) set open->op_stateid, (3) set open->op_delegation.
+ *
+ * Returns %nfs_ok on success; otherwise an nfs4stat value in
+ * network byte order is returned.
+ */
__be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
@@ -5371,7 +5509,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
goto out;
}
} else {
- status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true);
if (status) {
stp->st_stid.sc_type = NFS4_CLOSED_STID;
release_open_stateid(stp);
@@ -5605,6 +5743,81 @@ static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
}
#endif
+/* Check if any lock belonging to this lockowner has any blockers */
+static bool
+nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo)
+{
+ struct file_lock_context *ctx;
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_file *nf;
+
+ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+ nf = stp->st_stid.sc_file;
+ ctx = nf->fi_inode->i_flctx;
+ if (!ctx)
+ continue;
+ if (locks_owner_has_blockers(ctx, lo))
+ return true;
+ }
+ return false;
+}
+
+static bool
+nfs4_anylock_blockers(struct nfs4_client *clp)
+{
+ int i;
+ struct nfs4_stateowner *so;
+ struct nfs4_lockowner *lo;
+
+ if (atomic_read(&clp->cl_delegs_in_recall))
+ return true;
+ spin_lock(&clp->cl_lock);
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i],
+ so_strhash) {
+ if (so->so_is_open_owner)
+ continue;
+ lo = lockowner(so);
+ if (nfs4_lockowner_has_blockers(lo)) {
+ spin_unlock(&clp->cl_lock);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&clp->cl_lock);
+ return false;
+}
+
+static void
+nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
+ struct laundry_time *lt)
+{
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+
+ INIT_LIST_HEAD(reaplist);
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_state == NFSD4_EXPIRABLE)
+ goto exp_client;
+ if (!state_expired(lt, clp->cl_time))
+ break;
+ if (!atomic_read(&clp->cl_rpc_users))
+ clp->cl_state = NFSD4_COURTESY;
+ if (!client_has_state(clp) ||
+ ktime_get_boottime_seconds() >=
+ (clp->cl_time + NFSD_COURTESY_CLIENT_TIMEOUT))
+ goto exp_client;
+ if (nfs4_anylock_blockers(clp)) {
+exp_client:
+ if (!mark_client_expired_locked(clp))
+ list_add(&clp->cl_lru, reaplist);
+ }
+ }
+ spin_unlock(&nn->client_lock);
+}
+
static time64_t
nfs4_laundromat(struct nfsd_net *nn)
{
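
Note: nfs4_get_client_reaplist() encodes the courtesy-client policy:
clients already marked EXPIRABLE are reaped immediately; a client whose
lease has expired is downgraded to COURTESY and kept, unless it holds no
state, has been idle past the 24-hour NFSD_COURTESY_CLIENT_TIMEOUT, or is
blocking another client's locks. A simplified decision model (this ignores
the cl_rpc_users gate and the LRU early-exit):

	enum verdict { KEEP, COURTESY, EXPIRE };

	static enum verdict classify(int already_expirable, int lease_expired,
				     int has_state, long idle_secs,
				     int blocks_others)
	{
		const long window = 24 * 60 * 60; /* NFSD_COURTESY_CLIENT_TIMEOUT */

		if (already_expirable)
			return EXPIRE;
		if (!lease_expired)
			return KEEP;
		if (!has_state || idle_secs >= window || blocks_others)
			return EXPIRE;
		return COURTESY;
	}
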
@@ -5627,7 +5840,6 @@ nfs4_laundromat(struct nfsd_net *nn)
goto out;
}
nfsd4_end_grace(nn);
- INIT_LIST_HEAD(&reaplist);
spin_lock(&nn->s2s_cp_lock);
idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
@@ -5637,17 +5849,7 @@ nfs4_laundromat(struct nfsd_net *nn)
_free_cpntf_state_locked(nn, cps);
}
spin_unlock(&nn->s2s_cp_lock);
-
- spin_lock(&nn->client_lock);
- list_for_each_safe(pos, next, &nn->client_lru) {
- clp = list_entry(pos, struct nfs4_client, cl_lru);
- if (!state_expired(&lt, clp->cl_time))
- break;
- if (mark_client_expired_locked(clp))
- continue;
- list_add(&clp->cl_lru, &reaplist);
- }
- spin_unlock(&nn->client_lock);
+ nfs4_get_client_reaplist(nn, &reaplist, &lt);
list_for_each_safe(pos, next, &reaplist) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
trace_nfsd_clid_purged(&clp->cl_clientid);
@@ -5722,7 +5924,6 @@ out:
return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
}
-static struct workqueue_struct *laundry_wq;
static void laundromat_main(struct work_struct *);
static void
@@ -6551,6 +6752,29 @@ nfsd4_lm_put_owner(fl_owner_t owner)
nfs4_put_stateowner(&lo->lo_owner);
}
+/* return true if the client that owns this lock is expirable */
+static bool
+nfsd4_lm_lock_expirable(struct file_lock *cfl)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+ struct nfsd_net *nn;
+
+ if (try_to_expire_client(clp)) {
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ return true;
+ }
+ return false;
+}
+
+/* wait for the laundromat run scheduled by nfsd4_lm_lock_expirable() to complete */
+static void
+nfsd4_lm_expire_lock(void)
+{
+ flush_workqueue(laundry_wq);
+}
+
static void
nfsd4_lm_notify(struct file_lock *fl)
{
@@ -6577,9 +6801,12 @@ nfsd4_lm_notify(struct file_lock *fl)
}
static const struct lock_manager_operations nfsd_posix_mng_ops = {
+ .lm_mod_owner = THIS_MODULE,
.lm_notify = nfsd4_lm_notify,
.lm_get_owner = nfsd4_lm_get_owner,
.lm_put_owner = nfsd4_lm_put_owner,
+ .lm_lock_expirable = nfsd4_lm_lock_expirable,
+ .lm_expire_lock = nfsd4_lm_expire_lock,
};
static inline void
@@ -7297,22 +7524,36 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
return status;
}
+/**
+ * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations
+ * @rqstp: RPC transaction
+ * @cstate: NFSv4 COMPOUND state
+ * @u: RELEASE_LOCKOWNER arguments
+ *
+ * The lockowner's so_count is bumped when a lock record is added
+ * or when copying a conflicting lock. The latter case is brief,
+ * but can lead to fleeting false positives when looking for
+ * locks-in-use.
+ *
+ * Return values:
+ * %nfs_ok: lockowner released or not found
+ * %nfserr_locks_held: lockowner still in use
+ * %nfserr_stale_clientid: clientid no longer active
+ * %nfserr_expired: clientid not recognized
+ */
__be32
nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
clientid_t *clid = &rlockowner->rl_clientid;
- struct nfs4_stateowner *sop;
- struct nfs4_lockowner *lo = NULL;
struct nfs4_ol_stateid *stp;
- struct xdr_netobj *owner = &rlockowner->rl_owner;
- unsigned int hashval = ownerstr_hashval(owner);
- __be32 status;
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_lockowner *lo;
struct nfs4_client *clp;
- LIST_HEAD (reaplist);
+ LIST_HEAD(reaplist);
+ __be32 status;
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
@@ -7320,34 +7561,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
status = set_client(clid, cstate, nn);
if (status)
return status;
-
clp = cstate->clp;
- /* Find the matching lock stateowner */
- spin_lock(&clp->cl_lock);
- list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
- so_strhash) {
-
- if (sop->so_is_open_owner || !same_owner_str(sop, owner))
- continue;
- /* see if there are still any locks associated with it */
- lo = lockowner(sop);
- list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
- if (check_for_locks(stp->st_stid.sc_file, lo)) {
- status = nfserr_locks_held;
- spin_unlock(&clp->cl_lock);
- return status;
- }
- }
-
- nfs4_get_stateowner(sop);
- break;
- }
+ spin_lock(&clp->cl_lock);
+ lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner);
if (!lo) {
spin_unlock(&clp->cl_lock);
- return status;
+ return nfs_ok;
+ }
+ if (atomic_read(&lo->lo_owner.so_count) != 2) {
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
}
-
unhash_lockowner_locked(lo);
while (!list_empty(&lo->lo_owner.so_stateids)) {
stp = list_first_entry(&lo->lo_owner.so_stateids,
@@ -7357,11 +7583,11 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
+
free_ol_stateid_reaplist(&reaplist);
remove_blocked_locks(lo);
nfs4_put_stateowner(&lo->lo_owner);
-
- return status;
+ return nfs_ok;
}
static inline struct nfs4_client_reclaim *
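
Note: the rewritten RELEASE_LOCKOWNER no longer walks every stateid calling
check_for_locks(); it relies on the owner's reference count. A count of
exactly 2 means just the base reference plus the one taken by
find_lockowner_str_locked() above; every lock record (and, briefly, every
conflicting-lock copy) adds another, hence the "fleeting false positives"
caveat in the kernel-doc. In toy form (assuming the counting scheme as
described there):

	#include <stdbool.h>

	struct owner { int so_count; };	/* base ref + lookup ref + locks */

	static bool release_would_succeed(const struct owner *lo)
	{
		return lo->so_count == 2;	/* else nfserr_locks_held */
	}
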
@@ -7602,22 +7828,12 @@ nfs4_state_start(void)
{
int ret;
- laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
- if (laundry_wq == NULL) {
- ret = -ENOMEM;
- goto out;
- }
ret = nfsd4_create_callback_queue();
if (ret)
- goto out_free_laundry;
+ return ret;
set_max_delegations();
return 0;
-
-out_free_laundry:
- destroy_workqueue(laundry_wq);
-out:
- return ret;
}
void
@@ -7654,7 +7870,6 @@ nfs4_state_shutdown_net(struct net *net)
void
nfs4_state_shutdown(void)
{
- destroy_workqueue(laundry_wq);
nfsd4_destroy_callback_queue();
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index da92e7d2ab6a..61b2aae81abb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2411,7 +2411,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
- clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
return true;
}
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 0b3f12aa37ff..7da88bdc0d6c 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -206,7 +206,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
struct svc_cacherep *rp;
unsigned int i;
- nfsd_reply_cache_stats_destroy(nn);
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
for (i = 0; i < nn->drc_hashsize; i++) {
@@ -217,6 +216,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
rp, nn);
}
}
+ nfsd_reply_cache_stats_destroy(nn);
kvfree(nn->drc_hashtbl);
nn->drc_hashtbl = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 16920e4512bd..0621c2faf242 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1535,20 +1535,25 @@ static int __init init_nfsd(void)
retval = create_proc_exports_entry();
if (retval)
goto out_free_lockd;
- retval = register_filesystem(&nfsd_fs_type);
- if (retval)
- goto out_free_exports;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_free_filesystem;
+ goto out_free_exports;
retval = register_cld_notifier();
if (retval)
+ goto out_free_subsys;
+ retval = nfsd4_create_laundry_wq();
+ if (retval)
+ goto out_free_cld;
+ retval = register_filesystem(&nfsd_fs_type);
+ if (retval)
goto out_free_all;
return 0;
out_free_all:
+ nfsd4_destroy_laundry_wq();
+out_free_cld:
+ unregister_cld_notifier();
+out_free_subsys:
unregister_pernet_subsys(&nfsd_net_ops);
-out_free_filesystem:
- unregister_filesystem(&nfsd_fs_type);
out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
@@ -1566,6 +1571,8 @@ out_free_slabs:
static void __exit exit_nfsd(void)
{
+ unregister_filesystem(&nfsd_fs_type);
+ nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
@@ -1575,7 +1582,6 @@ static void __exit exit_nfsd(void)
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();
- unregister_filesystem(&nfsd_fs_type);
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
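
Note: init_nfsd() now registers the filesystem last, closing the window in
which a mount could reach nfsd before the laundry workqueue exists, and
exit_nfsd() unwinds in exactly the reverse order. The generic shape of that
LIFO init/unwind idiom (hypothetical step names):

	int init_a(void); int init_b(void); int init_c(void);
	void exit_a(void); void exit_b(void);

	int init_all(void)
	{
		int err;

		err = init_a();
		if (err)
			return err;
		err = init_b();
		if (err)
			goto undo_a;
		err = init_c();		/* user-visible step goes last */
		if (err)
			goto undo_b;
		return 0;
	undo_b:
		exit_b();
	undo_a:
		exit_a();
		return err;
	}
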
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 4fc1fd639527..847b482155ae 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -162,6 +162,8 @@ void nfs4_state_shutdown_net(struct net *net);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
+int nfsd4_create_laundry_wq(void);
+void nfsd4_destroy_laundry_wq(void);
#else
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
@@ -175,6 +177,8 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
{
return false;
}
+static inline int nfsd4_create_laundry_wq(void) { return 0; }
+static inline void nfsd4_destroy_laundry_wq(void) {}
#endif
/*
@@ -336,6 +340,7 @@ void nfsd_lockd_shutdown(void);
#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
+#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */
/*
* The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 95457cfd37fc..f3d6313914ed 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -149,6 +149,7 @@ struct nfs4_delegation {
/* For recall: */
int dl_retries;
struct nfsd4_callback dl_recall;
+ bool dl_recalled;
};
#define cb_to_delegation(cb) \
@@ -283,6 +284,28 @@ struct nfsd4_sessionid {
#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */
/*
+ * State Meaning Where set
+ * --------------------------------------------------------------------------
+ * | NFSD4_ACTIVE | Confirmed, active | Default |
+ * |------------------------------------------------------------------------|
+ * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist |
+ * | | Lease/lock/share | |
+ * | | reservation conflict | |
+ * | | can cause Courtesy | |
+ * | | client to be expired | |
+ * |------------------------------------------------------------------------|
+ * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat |
+ * | | expired by Laundromat| try_to_expire_client |
+ * | | due to conflict | |
+ * |------------------------------------------------------------------------|
+ */
+enum {
+ NFSD4_ACTIVE = 0,
+ NFSD4_COURTESY,
+ NFSD4_EXPIRABLE,
+};
+
+/*
* struct nfs4_client - one per client. Clientids live here.
*
* The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
@@ -385,6 +408,9 @@ struct nfs4_client {
struct list_head async_copies; /* list of async copies */
spinlock_t async_lock; /* lock for async copies */
atomic_t cl_cb_inflight; /* Outstanding callbacks */
+
+ unsigned int cl_state;
+ atomic_t cl_delegs_in_recall;
};
/* struct nfs4_client_reset
@@ -702,4 +728,9 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
extern void nfsd4_record_grace_done(struct nfsd_net *nn);
+static inline bool try_to_expire_client(struct nfs4_client *clp)
+{
+ cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
+ return clp->cl_state == NFSD4_EXPIRABLE;
+}
#endif /* NFSD4_STATE_H */
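
Note: try_to_expire_client() is a lock-free COURTESY-to-EXPIRABLE
transition: the cmpxchg() flips a COURTESY client, and the subsequent
re-read means an already-EXPIRABLE client also reports true, while an
ACTIVE client is left untouched and reports false. A self-contained C11
model of the same semantics:

	#include <stdatomic.h>
	#include <stdbool.h>

	enum { ACTIVE, COURTESY, EXPIRABLE };

	static bool try_to_expire(_Atomic int *state)
	{
		int expected = COURTESY;

		/* Only COURTESY -> EXPIRABLE; ACTIVE and EXPIRABLE unchanged. */
		atomic_compare_exchange_strong(state, &expected, EXPIRABLE);
		return atomic_load(state) == EXPIRABLE;
	}
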
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 242fa123e0e9..a60ead3b227a 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -692,12 +692,6 @@ DEFINE_CLID_EVENT(confirmed_r);
/*
* from fs/nfsd/filecache.h
*/
-TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
-TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE);
-TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED);
-
#define show_nf_flags(val) \
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
@@ -784,6 +778,34 @@ TRACE_EVENT(nfsd_file_acquire,
__entry->nf_file, __entry->status)
);
+TRACE_EVENT(nfsd_file_open,
+ TP_PROTO(struct nfsd_file *nf, __be32 status),
+ TP_ARGS(nf, status),
+ TP_STRUCT__entry(
+ __field(unsigned int, nf_hashval)
+ __field(void *, nf_inode) /* cannot be dereferenced */
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(void *, nf_file) /* cannot be dereferenced */
+ ),
+ TP_fast_assign(
+ __entry->nf_hashval = nf->nf_hashval;
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p",
+ __entry->nf_hashval,
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may),
+ __entry->nf_file)
+);
+
DECLARE_EVENT_CLASS(nfsd_file_search_class,
TP_PROTO(struct inode *inode, unsigned int hash, int found),
TP_ARGS(inode, hash, found),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c22ad0532e8e..840e3af63a6f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -827,14 +827,23 @@ retry:
return err;
}
+/**
+ * nfsd_open_verified - Open a regular file for the filecache
+ * @rqstp: RPC request
+ * @fhp: NFS filehandle of the file to open
+ * @may_flags: internal permission flags
+ * @filp: OUT: open "struct file *"
+ *
+ * Returns an nfsstat value in network byte order.
+ */
__be32
-nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
- int may_flags, struct file **filp)
+nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
+ struct file **filp)
{
__be32 err;
validate_process_creds();
- err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
validate_process_creds();
return err;
}
@@ -849,17 +858,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
struct svc_rqst *rqstp = sd->u.data;
- struct page **pp = rqstp->rq_next_page;
- struct page *page = buf->page;
- if (rqstp->rq_res.page_len == 0) {
- svc_rqst_replace_page(rqstp, page);
+ svc_rqst_replace_page(rqstp, buf->page);
+ if (rqstp->rq_res.page_len == 0)
rqstp->rq_res.page_base = buf->offset;
- } else if (page != pp[-1]) {
- svc_rqst_replace_page(rqstp, page);
- }
rqstp->rq_res.page_len += sd->len;
-
return sd->len;
}
@@ -1187,14 +1190,26 @@ out:
return err;
}
-static __be32
-nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
- struct iattr *iap)
+/**
+ * nfsd_create_setattr - Set a created file's attributes
+ * @rqstp: RPC transaction being executed
+ * @fhp: NFS filehandle of parent directory
+ * @resfhp: NFS filehandle of new object
+ * @iap: requested attributes of new object
+ *
+ * Returns nfs_ok on success, or an nfsstat in network byte order.
+ */
+__be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct iattr *iap)
{
+ __be32 status;
+
/*
- * Mode has already been set earlier in create:
+ * Mode has already been set by file creation.
*/
iap->ia_valid &= ~ATTR_MODE;
+
/*
* Setting uid/gid works only for root. Irix appears to
* send along the gid on create when it tries to implement
@@ -1202,10 +1217,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
*/
if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+
+ /*
+ * Callers expect new file metadata to be committed even
+ * if the attributes have not changed.
+ */
if (iap->ia_valid)
- return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
- /* Callers expect file metadata to be committed here */
- return nfserrno(commit_metadata(resfhp));
+ status = nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
+ else
+ status = nfserrno(commit_metadata(resfhp));
+
+ /*
+ * Transactional filesystems had a chance to commit changes
+ * for both parent and child simultaneously making the
+ * following commit_metadata a noop in many cases.
+ */
+ if (!status)
+ status = nfserrno(commit_metadata(fhp));
+
+ /*
+ * Update the new filehandle to pick up the new attributes.
+ */
+ if (!status)
+ status = fh_update(resfhp);
+
+ return status;
}
/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
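
Note: nfsd_create_setattr() now owns the whole post-create tail for both
NFSv3 and NFSv4: commit the child (via setattr or commit_metadata()), then
commit the parent (often a no-op on transactional filesystems that already
committed both together), then refresh the out filehandle. The
short-circuit chain in toy form (stand-in helpers for nfsd_setattr(),
commit_metadata(), and fh_update()):

	static int commit_child(void)   { return 0; }
	static int commit_parent(void)  { return 0; }	/* often a no-op */
	static int refresh_handle(void) { return 0; }

	static int post_create(void)
	{
		int status = commit_child();

		if (!status)
			status = commit_parent();
		if (!status)
			status = refresh_handle();
		return status;
	}
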
@@ -1232,7 +1268,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct dentry *dentry, *dchild;
struct inode *dirp;
__be32 err;
- __be32 err2;
int host_err;
dentry = fhp->fh_dentry;
@@ -1305,22 +1340,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err < 0)
goto out_nfserr;
- err = nfsd_create_setattr(rqstp, resfhp, iap);
+ err = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
- /*
- * nfsd_create_setattr already committed the child. Transactional
- * filesystems had a chance to commit changes for both parent and
- * child simultaneously making the following commit_metadata a
- * noop.
- */
- err2 = nfserrno(commit_metadata(fhp));
- if (err2)
- err = err2;
- /*
- * Update the file handle to get the new inode info.
- */
- if (!err)
- err = fh_update(resfhp);
out:
dput(dchild);
return err;
@@ -1376,172 +1397,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
/*
- * NFSv3 and NFSv4 version of nfsd_create
- */
-__be32
-do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct iattr *iap,
- struct svc_fh *resfhp, int createmode, u32 *verifier,
- bool *truncp, bool *created)
-{
- struct dentry *dentry, *dchild = NULL;
- struct inode *dirp;
- __be32 err;
- int host_err;
- __u32 v_mtime=0, v_atime=0;
-
- err = nfserr_perm;
- if (!flen)
- goto out;
- err = nfserr_exist;
- if (isdotent(fname, flen))
- goto out;
- if (!(iap->ia_valid & ATTR_MODE))
- iap->ia_mode = 0;
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
- if (err)
- goto out;
-
- dentry = fhp->fh_dentry;
- dirp = d_inode(dentry);
-
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
-
- fh_lock_nested(fhp, I_MUTEX_PARENT);
-
- /*
- * Compose the response file handle.
- */
- dchild = lookup_one_len(fname, dentry, flen);
- host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- goto out_nfserr;
-
- /* If file doesn't exist, check for permissions to create one */
- if (d_really_is_negative(dchild)) {
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
- if (err)
- goto out;
- }
-
- err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- if (err)
- goto out;
-
- if (nfsd_create_is_exclusive(createmode)) {
- /* solaris7 gets confused (bugid 4218508) if these have
- * the high bit set, as do xfs filesystems without the
- * "bigtime" feature. So just clear the high bits. If this is
- * ever changed to use different attrs for storing the
- * verifier, then do_open_lookup() will also need to be fixed
- * accordingly.
- */
- v_mtime = verifier[0]&0x7fffffff;
- v_atime = verifier[1]&0x7fffffff;
- }
-
- if (d_really_is_positive(dchild)) {
- err = 0;
-
- switch (createmode) {
- case NFS3_CREATE_UNCHECKED:
- if (! d_is_reg(dchild))
- goto out;
- else if (truncp) {
- /* in nfsv4, we need to treat this case a little
- * differently. we don't want to truncate the
- * file now; this would be wrong if the OPEN
- * fails for some other reason. furthermore,
- * if the size is nonzero, we should ignore it
- * according to spec!
- */
- *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
- }
- else {
- iap->ia_valid &= ATTR_SIZE;
- goto set_attr;
- }
- break;
- case NFS3_CREATE_EXCLUSIVE:
- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
- && d_inode(dchild)->i_atime.tv_sec == v_atime
- && d_inode(dchild)->i_size == 0 ) {
- if (created)
- *created = true;
- break;
- }
- fallthrough;
- case NFS4_CREATE_EXCLUSIVE4_1:
- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
- && d_inode(dchild)->i_atime.tv_sec == v_atime
- && d_inode(dchild)->i_size == 0 ) {
- if (created)
- *created = true;
- goto set_attr;
- }
- fallthrough;
- case NFS3_CREATE_GUARDED:
- err = nfserr_exist;
- }
- fh_drop_write(fhp);
- goto out;
- }
-
- if (!IS_POSIXACL(dirp))
- iap->ia_mode &= ~current_umask();
-
- host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
- if (host_err < 0) {
- fh_drop_write(fhp);
- goto out_nfserr;
- }
- if (created)
- *created = true;
-
- nfsd_check_ignore_resizing(iap);
-
- if (nfsd_create_is_exclusive(createmode)) {
- /* Cram the verifier into atime/mtime */
- iap->ia_valid = ATTR_MTIME|ATTR_ATIME
- | ATTR_MTIME_SET|ATTR_ATIME_SET;
- /* XXX someone who knows this better please fix it for nsec */
- iap->ia_mtime.tv_sec = v_mtime;
- iap->ia_atime.tv_sec = v_atime;
- iap->ia_mtime.tv_nsec = 0;
- iap->ia_atime.tv_nsec = 0;
- }
-
- set_attr:
- err = nfsd_create_setattr(rqstp, resfhp, iap);
-
- /*
- * nfsd_create_setattr already committed the child
- * (and possibly also the parent).
- */
- if (!err)
- err = nfserrno(commit_metadata(fhp));
-
- /*
- * Update the filehandle to get the new inode info.
- */
- if (!err)
- err = fh_update(resfhp);
-
- out:
- fh_unlock(fhp);
- if (dchild && !IS_ERR(dchild))
- dput(dchild);
- fh_drop_write(fhp);
- return err;
-
- out_nfserr:
- err = nfserrno(host_err);
- goto out;
-}
-
-/*
* Read a symlink. On entry, *lenp must contain the maximum path length that
* fits into the buffer. On return, it contains the true length.
* N.B. After this call fhp needs an fh_put
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index ccb87b2864f6..26347d76f44a 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -69,10 +69,8 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
-__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct iattr *attrs,
- struct svc_fh *res, int createmode,
- u32 *verifier, bool *truncp, bool *created);
+__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct iattr *iap);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
u64 offset, u32 count, __be32 *verf);
#ifdef CONFIG_NFSD_V4
@@ -88,7 +86,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
+__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
int, struct file **);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
@@ -159,10 +157,4 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
AT_STATX_SYNC_AS_STAT));
}
-static inline int nfsd_create_is_exclusive(int createmode)
-{
- return createmode == NFS3_CREATE_EXCLUSIVE
- || createmode == NFS4_CREATE_EXCLUSIVE4_1;
-}
-
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 846ab6df9d48..7b744011f2d3 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -273,6 +273,7 @@ struct nfsd4_open {
bool op_truncate; /* used during processing */
bool op_created; /* used during processing */
struct nfs4_openowner *op_openowner; /* used during processing */
+ struct file *op_filp; /* used during processing */
struct nfs4_file *op_file; /* used during processing */
struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_clnt_odstate *op_odstate; /* used during processing */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6045cea21f52..67f63cfeade5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -63,10 +63,10 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
/**
* nilfs_get_block() - get a file block on the filesystem (callback function)
- * @inode - inode struct of the target file
- * @blkoff - file block number
- * @bh_result - buffer head to be mapped on
- * @create - indicate whether allocating the block or not when it has not
+ * @inode: inode struct of the target file
+ * @blkoff: file block number
+ * @bh_result: buffer head to be mapped on
+ * @create: indicate whether allocating the block or not when it has not
* been allocated yet.
*
* This function does not issue actual read request of the specified data
@@ -140,14 +140,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
}
/**
- * nilfs_readpage() - implement readpage() method of nilfs_aops {}
+ * nilfs_read_folio() - implement read_folio() method of nilfs_aops {}
* address_space_operations.
- * @file - file struct of the file to be read
- * @page - the page to be read
+ * @file: file struct of the file to be read
+ * @folio: the folio to be read
*/
-static int nilfs_readpage(struct file *file, struct page *page)
+static int nilfs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, nilfs_get_block);
+ return mpage_read_folio(folio, nilfs_get_block);
}
static void nilfs_readahead(struct readahead_control *rac)
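
Note: this is the mechanical ->readpage to ->read_folio conversion: the
callback now takes a folio, and mpage_read_folio() supersedes
mpage_readpage(). The pattern as a kernel-only sketch (foo_get_block is a
hypothetical filesystem-specific get_block_t):

	static int foo_read_folio(struct file *file, struct folio *folio)
	{
		return mpage_read_folio(folio, foo_get_block);
	}

	static const struct address_space_operations foo_aops = {
		.read_folio	= foo_read_folio,
		/* .readpage is gone from address_space_operations */
	};
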
@@ -248,7 +248,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
}
static int nilfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
@@ -258,8 +258,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(err))
return err;
- err = block_write_begin(mapping, pos, len, flags, pagep,
- nilfs_get_block);
+ err = block_write_begin(mapping, pos, len, pagep, nilfs_get_block);
if (unlikely(err)) {
nilfs_write_failed(mapping, pos + len);
nilfs_transaction_abort(inode->i_sb);
@@ -299,13 +298,12 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
- .readpage = nilfs_readpage,
+ .read_folio = nilfs_read_folio,
.writepages = nilfs_writepages,
.dirty_folio = nilfs_dirty_folio,
.readahead = nilfs_readahead,
.write_begin = nilfs_write_begin,
.write_end = nilfs_write_end,
- /* .releasepage = nilfs_releasepage, */
.invalidate_folio = block_invalidate_folio,
.direct_IO = nilfs_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1088,6 +1086,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
/**
* nilfs_dirty_inode - reflect changes on given inode to an inode block.
* @inode: inode of the file to be registered.
+ * @flags: flags to determine the dirty state of the inode
*
* nilfs_dirty_inode() loads an inode block containing the specified
* @inode and copies data from a nilfs_inode to a corresponding inode
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index fec194a666f4..87e1004b606d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1052,20 +1052,20 @@ out:
static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
{
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
- struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
struct fstrim_range range;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(nilfs->ns_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
+ range.minlen = max_t(u64, range.minlen,
+ bdev_discard_granularity(nilfs->ns_bdev));
down_read(&nilfs->ns_segctor_sem);
ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
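
Note: the block layer stopped exporting blk_queue_discard(); discard
capability is now probed with bdev_max_discard_sectors() (zero means no
support) and the granularity with bdev_discard_granularity(), so
filesystems no longer reach into request_queue limits directly. The probe
pattern as a kernel-only sketch:

	static int trim_precheck(struct block_device *bdev, struct fstrim_range *r)
	{
		if (!bdev_max_discard_sectors(bdev))
			return -EOPNOTSUPP;
		r->minlen = max_t(u64, r->minlen, bdev_discard_granularity(bdev));
		return 0;
	}
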
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 9e2ed76c0f25..0955b657938f 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -511,7 +511,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
pos = rb->blkoff << inode->i_blkbits;
err = block_write_begin(inode->i_mapping, pos, blocksize,
- 0, &page, nilfs_get_block);
+ &page, nilfs_get_block);
if (unlikely(err)) {
loff_t isize = inode->i_size;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index e385cca2004a..77ff8e95421f 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1100,7 +1100,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (ret < 0) {
put_bh(su_bh);
goto out_sem;
@@ -1134,7 +1134,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
ndiscarded += nblocks;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index dd48a8f74d57..3b4a079c9617 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -672,7 +672,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (ret < 0)
return ret;
nblocks = 0;
@@ -682,7 +682,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
return ret;
}
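blkdev_issue_discard() loses its trailing flags argument across these hunks; with the BLKDEV_DISCARD_* flags gone (secure erase is assumed to have moved to a dedicated helper), the call is reduced to device, start sector, sector count, and GFP mask:

	/* Sketch: the four-argument blkdev_issue_discard() these hunks
	 * assume; foo_discard_run is a hypothetical wrapper. */
	static int foo_discard_run(struct block_device *bdev,
				   sector_t start, sector_t nr_sects)
	{
		return blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
	}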
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 829dd4a61b66..190aa717fa32 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -168,7 +168,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
- mutex_lock(&dnotify_group->mark_mutex);
+ fsnotify_group_lock(dnotify_group);
spin_lock(&fsn_mark->lock);
prev = &dn_mark->dn;
@@ -191,7 +191,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
free = true;
}
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
if (free)
fsnotify_free_mark(fsn_mark);
@@ -324,7 +324,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
new_dn_mark->dn = NULL;
/* this is needed to prevent the fcntl/close race described below */
- mutex_lock(&dnotify_group->mark_mutex);
+ fsnotify_group_lock(dnotify_group);
/* add the new_fsn_mark or find an old one. */
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
@@ -334,7 +334,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
} else {
error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
if (error) {
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
goto out_err;
}
spin_lock(&new_fsn_mark->lock);
@@ -383,7 +383,7 @@ out:
if (destroy)
fsnotify_detach_mark(fsn_mark);
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
if (destroy)
fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
@@ -401,7 +401,8 @@ static int __init dnotify_init(void)
SLAB_PANIC|SLAB_ACCOUNT);
dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
- dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
+ dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops,
+ FSNOTIFY_GROUP_NOFS);
if (IS_ERR(dnotify_group))
panic("unable to allocate fsnotify group for dnotify\n");
dnotify_sysctl_init();
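The mutex_lock(&group->mark_mutex) calls converted here go through new wrappers. Given the FSNOTIFY_GROUP_NOFS flag added in this series, the wrappers plausibly pair the mutex with a memalloc NOFS scope so that groups supporting evictable marks stay reclaim-safe; a sketch of that shape (the owner_flags field name is an assumption, not quoted from this diff):

	/* Sketch of the locking wrappers implied by this series. */
	static inline void fsnotify_group_lock(struct fsnotify_group *group)
	{
		mutex_lock(&group->mark_mutex);
		if (group->flags & FSNOTIFY_GROUP_NOFS)
			group->owner_flags = memalloc_nofs_save();
	}

	static inline void fsnotify_group_unlock(struct fsnotify_group *group)
	{
		if (group->flags & FSNOTIFY_GROUP_NOFS)
			memalloc_nofs_restore(group->owner_flags);
		mutex_unlock(&group->mark_mutex);
	}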
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 985e995d2a39..4f897e109547 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -319,12 +319,8 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
return 0;
}
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
-
- /* Apply ignore mask regardless of ISDIR and ON_CHILD flags */
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ /* Apply ignore mask regardless of mark's ISDIR flag */
marks_ignored_mask |= mark->ignored_mask;
/*
@@ -334,14 +330,6 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
continue;
- /*
- * If the event is on a child and this mark is on a parent not
- * watching children, don't send it!
- */
- if (type == FSNOTIFY_ITER_TYPE_PARENT &&
- !(mark->mask & FS_EVENT_ON_CHILD))
- continue;
-
marks_mask |= mark->mask;
/* Record the mark types of this group that matched the event */
@@ -849,16 +837,14 @@ out:
*/
static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *mark;
int type;
__kernel_fsid_t fsid = {};
- fsnotify_foreach_iter_type(type) {
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
struct fsnotify_mark_connector *conn;
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
-
- conn = READ_ONCE(iter_info->marks[type]->connector);
+ conn = READ_ONCE(mark->connector);
/* Mark is just getting destroyed or created? */
if (!conn)
continue;
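The fsnotify_foreach_iter_mark_type() iterator used in these hunks folds the removed three-step pattern (walk the types, skip unreported ones, fetch the mark) into one loop header. Judging from the before/after code here, one pass is roughly equivalent to:

	/* Rough equivalent of fsnotify_foreach_iter_mark_type(),
	 * reconstructed from the open-coded pattern it replaces. */
	fsnotify_foreach_iter_type(type) {
		if (!fsnotify_iter_should_report_type(iter_info, type))
			continue;
		mark = iter_info->marks[type];
		/* ... loop body operating on mark ... */
	}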
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index a3d5b751cac5..80e0ec95b113 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -490,3 +490,15 @@ static inline unsigned int fanotify_event_hash_bucket(
{
return event->hash & FANOTIFY_HTABLE_MASK;
}
+
+static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
+{
+ unsigned int mflags = 0;
+
+ if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+ mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
+ if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
+ mflags |= FAN_MARK_EVICTABLE;
+
+ return mflags;
+}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9b32b76a9c30..c2255b440df9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -264,7 +264,7 @@ static int create_fd(struct fsnotify_group *group, struct path *path,
* originally opened O_WRONLY.
*/
new_file = dentry_open(path,
- group->fanotify_data.f_flags | FMODE_NONOTIFY,
+ group->fanotify_data.f_flags | __FMODE_NONOTIFY,
current_cred());
if (IS_ERR(new_file)) {
/*
@@ -1035,10 +1035,10 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return -ENOENT;
}
@@ -1048,7 +1048,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
fsnotify_recalc_mask(fsn_mark->connector);
if (destroy_mark)
fsnotify_detach_mark(fsn_mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
if (destroy_mark)
fsnotify_free_mark(fsn_mark);
@@ -1081,47 +1081,63 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
flags, umask);
}
-static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask, unsigned int flags,
- __u32 *removed)
+static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
+ unsigned int fan_flags)
{
- fsn_mark->ignored_mask |= mask;
+ bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
+ bool recalc = false;
/*
* Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
* the removal of the FS_MODIFY bit in calculated mask if it was set
* because of an ignored mask that is now going to survive FS_MODIFY.
*/
- if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ if ((fan_flags & FAN_MARK_IGNORED_MASK) &&
+ (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
!(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
if (!(fsn_mark->mask & FS_MODIFY))
- *removed = FS_MODIFY;
+ recalc = true;
}
+
+ if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
+ want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ return recalc;
+
+ /*
+ * NO_IREF may be removed from a mark, but not added.
+ * When removed, fsnotify_recalc_mask() will take the inode ref.
+ */
+ WARN_ON_ONCE(!want_iref);
+ fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
+
+ return true;
}
-static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask, unsigned int flags,
- __u32 *removed)
+static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+ __u32 mask, unsigned int fan_flags)
{
- __u32 oldmask, newmask;
+ bool recalc;
spin_lock(&fsn_mark->lock);
- oldmask = fsnotify_calc_mask(fsn_mark);
- if (!(flags & FAN_MARK_IGNORED_MASK)) {
+ if (!(fan_flags & FAN_MARK_IGNORED_MASK))
fsn_mark->mask |= mask;
- } else {
- fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed);
- }
- newmask = fsnotify_calc_mask(fsn_mark);
+ else
+ fsn_mark->ignored_mask |= mask;
+
+ recalc = fsnotify_calc_mask(fsn_mark) &
+ ~fsnotify_conn_mask(fsn_mark->connector);
+
+ recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
spin_unlock(&fsn_mark->lock);
- return newmask & ~oldmask;
+ return recalc;
}
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
unsigned int obj_type,
+ unsigned int fan_flags,
__kernel_fsid_t *fsid)
{
struct ucounts *ucounts = group->fanotify_data.ucounts;
@@ -1144,6 +1160,9 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
}
fsnotify_init_mark(mark, group);
+ if (fan_flags & FAN_MARK_EVICTABLE)
+ mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
+
ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
@@ -1170,39 +1189,49 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int obj_type,
- __u32 mask, unsigned int flags,
+ __u32 mask, unsigned int fan_flags,
__kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
- __u32 added, removed = 0;
+ bool recalc;
int ret = 0;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
+ fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
+ fan_flags, fsid);
if (IS_ERR(fsn_mark)) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return PTR_ERR(fsn_mark);
}
}
/*
+	 * A non-evictable mark cannot be downgraded to an evictable mark.
+ */
+ if (fan_flags & FAN_MARK_EVICTABLE &&
+ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ /*
* Error events are pre-allocated per group, only if strictly
* needed (i.e. FAN_FS_ERROR was requested).
*/
- if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+ if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
ret = fanotify_group_init_error_pool(group);
if (ret)
goto out;
}
- added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed);
- if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector)))
+ recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
+ if (recalc)
fsnotify_recalc_mask(fsn_mark->connector);
out:
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_put_mark(fsn_mark);
return ret;
@@ -1348,14 +1377,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
(!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
return -EINVAL;
- f_flags = O_RDWR | FMODE_NONOTIFY;
+ f_flags = O_RDWR | __FMODE_NONOTIFY;
if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC;
if (flags & FAN_NONBLOCK)
f_flags |= O_NONBLOCK;
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
- group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
+ group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
+ FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS);
if (IS_ERR(group)) {
return PTR_ERR(group);
}
@@ -1598,6 +1628,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
/*
+	 * Evictable is only relevant for inode marks, because only inode objects
+	 * can be evicted under memory pressure.
+ */
+ if (flags & FAN_MARK_EVICTABLE &&
+ mark_type != FAN_MARK_INODE)
+ goto fput_and_out;
+
+ /*
* Events that do not carry enough information to report
* event->fd require a group that supports reporting fid. Those
* events are not supported on a mount mark, because they do not
@@ -1657,6 +1695,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
else
mnt = path.mnt;
+ /*
+ * FAN_RENAME is not allowed on non-dir (for now).
+ * We shouldn't have allowed setting any dirent events in mask of
+ * non-dir, but because we always allowed it, error only if group
+ * was initialized with the new flag FAN_REPORT_TARGET_FID.
+ */
+ ret = -ENOTDIR;
+ if (inode && !S_ISDIR(inode->i_mode) &&
+ ((mask & FAN_RENAME) ||
+ ((mask & FANOTIFY_DIRENT_EVENTS) &&
+ FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID))))
+ goto path_put_and_out;
+
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
@@ -1749,7 +1800,7 @@ static int __init fanotify_user_setup(void)
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
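From userspace, the new FAN_MARK_EVICTABLE flag goes to fanotify_mark() on an inode mark, typically together with an ignore mask, so the kernel may reclaim the marked inode instead of pinning it. A hedged usage sketch (the surrounding program setup is assumed):

	/* Sketch: add an evictable ignore mark on a path; needs a kernel
	 * with FAN_MARK_EVICTABLE as introduced by this series. */
	#include <fcntl.h>
	#include <sys/fanotify.h>

	static int add_evictable_ignore(int fan_fd, const char *path)
	{
		return fanotify_mark(fan_fd,
				     FAN_MARK_ADD | FAN_MARK_EVICTABLE |
				     FAN_MARK_IGNORED_MASK |
				     FAN_MARK_IGNORED_SURV_MODIFY,
				     FAN_OPEN | FAN_ACCESS, AT_FDCWD, path);
	}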
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 57f0d5d9f934..59fb40abe33d 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -14,6 +14,7 @@
#include <linux/exportfs.h>
#include "inotify/inotify.h"
+#include "fanotify/fanotify.h"
#include "fdinfo.h"
#include "fsnotify.h"
@@ -28,13 +29,13 @@ static void show_fdinfo(struct seq_file *m, struct file *f,
struct fsnotify_group *group = f->private_data;
struct fsnotify_mark *mark;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
list_for_each_entry(mark, &group->marks_list, g_list) {
show(m, mark);
if (seq_has_overflowed(m))
break;
}
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
}
#if defined(CONFIG_EXPORTFS)
@@ -83,16 +84,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(fsnotify_conn_inode(mark->connector));
if (inode) {
- /*
- * IN_ALL_EVENTS represents all of the mask bits
- * that we expose to userspace. There is at
- * least one bit (FS_EVENT_ON_CHILD) which is
- * used only internally to the kernel.
- */
- u32 mask = mark->mask & IN_ALL_EVENTS;
- seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+ seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mask, mark->ignored_mask);
+ inotify_mark_user_mask(mark));
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
@@ -110,12 +104,9 @@ void inotify_show_fdinfo(struct seq_file *m, struct file *f)
static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
{
- unsigned int mflags = 0;
+ unsigned int mflags = fanotify_mark_user_flags(mark);
struct inode *inode;
- if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
- mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
-
if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = igrab(fsnotify_conn_inode(mark->connector));
if (!inode)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 70a8516b78bc..0b3e74935cb4 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -253,7 +253,7 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
if (WARN_ON_ONCE(!inode && !dir))
return 0;
- if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+ if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
return 0;
@@ -290,22 +290,15 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
}
if (parent_mark) {
- /*
- * parent_mark indicates that the parent inode is watching
- * children and interested in this event, which is an event
- * possible on child. But is *this mark* watching children and
- * interested in this event?
- */
- if (parent_mark->mask & FS_EVENT_ON_CHILD) {
- ret = fsnotify_handle_inode_event(group, parent_mark, mask,
- data, data_type, dir, name, 0);
- if (ret)
- return ret;
- }
- if (!inode_mark)
- return 0;
+ ret = fsnotify_handle_inode_event(group, parent_mark, mask,
+ data, data_type, dir, name, 0);
+ if (ret)
+ return ret;
}
+ if (!inode_mark)
+ return 0;
+
if (mask & FS_EVENT_ON_CHILD) {
/*
* Some events can be sent on both parent dir and child marks
@@ -335,31 +328,23 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
struct fsnotify_mark *mark;
int type;
- if (WARN_ON(!iter_info->report_mask))
+ if (!iter_info->report_mask)
return 0;
/* clear ignored on inode modification */
if (mask & FS_MODIFY) {
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
- if (mark &&
- !(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ if (!(mark->flags &
+ FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
mark->ignored_mask = 0;
}
}
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
- /* does the object mark tell us to do something? */
- if (mark) {
- group = mark->group;
- marks_mask |= mark->mask;
- marks_ignored_mask |= mark->ignored_mask;
- }
+ /* Are any of the group marks interested in this event? */
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ group = mark->group;
+ marks_mask |= mark->mask;
+ marks_ignored_mask |= mark->ignored_mask;
}
pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
@@ -403,11 +388,11 @@ static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
/*
* iter_info is a multi head priority queue of marks.
- * Pick a subset of marks from queue heads, all with the
- * same group and set the report_mask for selected subset.
- * Returns the report_mask of the selected subset.
+ * Pick a subset of marks from queue heads, all with the same group
+ * and set the report_mask to a subset of the selected marks.
+ * Returns false if there are no more groups to iterate.
*/
-static unsigned int fsnotify_iter_select_report_types(
+static bool fsnotify_iter_select_report_types(
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *max_prio_group = NULL;
@@ -423,30 +408,48 @@ static unsigned int fsnotify_iter_select_report_types(
}
if (!max_prio_group)
- return 0;
+ return false;
/* Set the report mask for marks from same group as max prio group */
+ iter_info->current_group = max_prio_group;
iter_info->report_mask = 0;
fsnotify_foreach_iter_type(type) {
mark = iter_info->marks[type];
- if (mark &&
- fsnotify_compare_groups(max_prio_group, mark->group) == 0)
+ if (mark && mark->group == iter_info->current_group) {
+ /*
+ * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
+ * is watching children and interested in this event,
+ * which is an event possible on child.
+ * But is *this mark* watching children?
+ */
+ if (type == FSNOTIFY_ITER_TYPE_PARENT &&
+ !(mark->mask & FS_EVENT_ON_CHILD))
+ continue;
+
fsnotify_iter_set_report_type(iter_info, type);
+ }
}
- return iter_info->report_mask;
+ return true;
}
/*
- * Pop from iter_info multi head queue, the marks that were iterated in the
+ * Pop from the iter_info multi head queue the marks that belong to the group of the
* current iteration step.
*/
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *mark;
int type;
+ /*
+ * We cannot use fsnotify_foreach_iter_mark_type() here because we
+ * may need to advance a mark of type X that belongs to current_group
+ * but was not selected for reporting.
+ */
fsnotify_foreach_iter_type(type) {
- if (fsnotify_iter_should_report_type(iter_info, type))
+ mark = iter_info->marks[type];
+ if (mark && mark->group == iter_info->current_group)
iter_info->marks[type] =
fsnotify_next_mark(iter_info->marks[type]);
}
@@ -581,7 +584,7 @@ static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b7d4d64f87c2..1de6631a3925 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -112,8 +112,10 @@ void fsnotify_put_group(struct fsnotify_group *group)
EXPORT_SYMBOL_GPL(fsnotify_put_group);
static struct fsnotify_group *__fsnotify_alloc_group(
- const struct fsnotify_ops *ops, gfp_t gfp)
+ const struct fsnotify_ops *ops,
+ int flags, gfp_t gfp)
{
+ static struct lock_class_key nofs_marks_lock;
struct fsnotify_group *group;
group = kzalloc(sizeof(struct fsnotify_group), gfp);
@@ -133,6 +135,17 @@ static struct fsnotify_group *__fsnotify_alloc_group(
INIT_LIST_HEAD(&group->marks_list);
group->ops = ops;
+ group->flags = flags;
+ /*
+ * For most backends, eviction of inode with a mark is not expected,
+ * because marks hold a refcount on the inode against eviction.
+ *
+ * Use a different lockdep class for groups that support evictable
+ * inode marks, because with evictable marks, mark_mutex is NOT
+ * fs-reclaim safe - the mutex is taken when evicting inodes.
+ */
+ if (flags & FSNOTIFY_GROUP_NOFS)
+ lockdep_set_class(&group->mark_mutex, &nofs_marks_lock);
return group;
}
@@ -140,20 +153,15 @@ static struct fsnotify_group *__fsnotify_alloc_group(
/*
* Create a new fsnotify_group and hold a reference for the group returned.
*/
-struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops,
+ int flags)
{
- return __fsnotify_alloc_group(ops, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
+ gfp_t gfp = (flags & FSNOTIFY_GROUP_USER) ? GFP_KERNEL_ACCOUNT :
+ GFP_KERNEL;
-/*
- * Create a new fsnotify_group and hold a reference for the group returned.
- */
-struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
-{
- return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
+ return __fsnotify_alloc_group(ops, flags, gfp);
}
-EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
int fsnotify_fasync(int fd, struct file *file, int on)
{
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 2007e3711916..7d5df7a21539 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -22,6 +22,25 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct inotify_event_info, fse);
}
+/*
+ * INOTIFY_USER_MASK represents all of the mask bits that we expose to
+ * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+#define INOTIFY_USER_MASK (IN_ALL_EVENTS)
+
+static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
+{
+ __u32 mask = fsn_mark->mask & INOTIFY_USER_MASK;
+
+ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK)
+ mask |= IN_EXCL_UNLINK;
+ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
+ mask |= IN_ONESHOT;
+
+ return mask;
+}
+
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group);
extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark,
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d92d7b0adc9a..49cfe2ae6d23 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -122,7 +122,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
fsnotify_destroy_event(group, fsn_event);
}
- if (inode_mark->mask & IN_ONESHOT)
+ if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
fsnotify_destroy_mark(inode_mark, group);
return 0;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 54583f62dc44..ed42a189faa2 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -110,11 +110,26 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
mask |= FS_EVENT_ON_CHILD;
/* mask off the flags used to open the fd */
- mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
+ mask |= (arg & INOTIFY_USER_MASK);
return mask;
}
+#define INOTIFY_MARK_FLAGS \
+ (FSNOTIFY_MARK_FLAG_EXCL_UNLINK | FSNOTIFY_MARK_FLAG_IN_ONESHOT)
+
+static inline unsigned int inotify_arg_to_flags(u32 arg)
+{
+ unsigned int flags = 0;
+
+ if (arg & IN_EXCL_UNLINK)
+ flags |= FSNOTIFY_MARK_FLAG_EXCL_UNLINK;
+ if (arg & IN_ONESHOT)
+ flags |= FSNOTIFY_MARK_FLAG_IN_ONESHOT;
+
+ return flags;
+}
+
static inline u32 inotify_mask_to_arg(__u32 mask)
{
return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
@@ -526,13 +541,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
struct fsnotify_mark *fsn_mark;
struct inotify_inode_mark *i_mark;
__u32 old_mask, new_mask;
- __u32 mask;
- int add = (arg & IN_MASK_ADD);
+ int replace = !(arg & IN_MASK_ADD);
int create = (arg & IN_MASK_CREATE);
int ret;
- mask = inotify_arg_to_mask(inode, arg);
-
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
@@ -545,10 +557,12 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
spin_lock(&fsn_mark->lock);
old_mask = fsn_mark->mask;
- if (add)
- fsn_mark->mask |= mask;
- else
- fsn_mark->mask = mask;
+ if (replace) {
+ fsn_mark->mask = 0;
+ fsn_mark->flags &= ~INOTIFY_MARK_FLAGS;
+ }
+ fsn_mark->mask |= inotify_arg_to_mask(inode, arg);
+ fsn_mark->flags |= inotify_arg_to_flags(arg);
new_mask = fsn_mark->mask;
spin_unlock(&fsn_mark->lock);
@@ -579,19 +593,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
u32 arg)
{
struct inotify_inode_mark *tmp_i_mark;
- __u32 mask;
int ret;
struct idr *idr = &group->inotify_data.idr;
spinlock_t *idr_lock = &group->inotify_data.idr_lock;
- mask = inotify_arg_to_mask(inode, arg);
-
tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
if (unlikely(!tmp_i_mark))
return -ENOMEM;
fsnotify_init_mark(&tmp_i_mark->fsn_mark, group);
- tmp_i_mark->fsn_mark.mask = mask;
+ tmp_i_mark->fsn_mark.mask = inotify_arg_to_mask(inode, arg);
+ tmp_i_mark->fsn_mark.flags = inotify_arg_to_flags(arg);
tmp_i_mark->wd = -1;
ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
@@ -628,13 +640,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
{
int ret = 0;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
/* try to update an existing watch with the new arg */
ret = inotify_update_existing_watch(group, inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
ret = inotify_new_watch(group, inode, arg);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return ret;
}
@@ -644,7 +656,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
struct fsnotify_group *group;
struct inotify_event_info *oevent;
- group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
+ group = fsnotify_alloc_group(&inotify_fsnotify_ops,
+ FSNOTIFY_GROUP_USER);
if (IS_ERR(group))
return group;
@@ -845,9 +858,7 @@ static int __init inotify_user_setup(void)
BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
- BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
- BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 4853184f7dde..c74ef947447d 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -116,20 +116,64 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
return *fsnotify_conn_mask_p(conn);
}
-static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+static void fsnotify_get_inode_ref(struct inode *inode)
+{
+ ihold(inode);
+ atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+}
+
+/*
+ * Grab or drop inode reference for the connector if needed.
+ *
+ * When it's time to drop the reference, we only clear the HAS_IREF flag and
+ * return the inode object. fsnotify_drop_object() will be responsible for doing
+ * iput() outside of spinlocks. This happens when the last mark that wanted iref is
+ * detached.
+ */
+static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
+ bool want_iref)
+{
+ bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
+ struct inode *inode = NULL;
+
+ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE ||
+ want_iref == has_iref)
+ return NULL;
+
+ if (want_iref) {
+ /* Pin inode if any mark wants inode refcount held */
+ fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
+ conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
+ } else {
+ /* Unpin inode after detach of last mark that wanted iref */
+ inode = fsnotify_conn_inode(conn);
+ conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
+ }
+
+ return inode;
+}
+
+static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
+ bool want_iref = false;
struct fsnotify_mark *mark;
assert_spin_locked(&conn->lock);
/* We can get detached connector here when inode is getting unlinked. */
if (!fsnotify_valid_obj_type(conn->type))
- return;
+ return NULL;
hlist_for_each_entry(mark, &conn->list, obj_list) {
- if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
- new_mask |= fsnotify_calc_mask(mark);
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
+ continue;
+ new_mask |= fsnotify_calc_mask(mark);
+ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
+ !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ want_iref = true;
}
*fsnotify_conn_mask_p(conn) = new_mask;
+
+ return fsnotify_update_iref(conn, want_iref);
}
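__fsnotify_recalc_mask() now hands back the inode whose reference must be dropped (or NULL), so the iput() can happen after the connector spinlock is released. The caller-side pattern implied by the fsnotify_put_mark() hunk below is roughly:

	/* Sketch fragment: recalc under conn->lock, drop the returned
	 * object reference only after unlocking. */
	spin_lock(&conn->lock);
	objp = __fsnotify_recalc_mask(conn);
	type = conn->type;
	spin_unlock(&conn->lock);
	fsnotify_drop_object(type, objp);	/* deferred iput() */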
/*
@@ -169,12 +213,6 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static void fsnotify_get_inode_ref(struct inode *inode)
-{
- ihold(inode);
- atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
-}
-
static void fsnotify_put_inode_ref(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -213,6 +251,10 @@ static void *fsnotify_detach_connector_from_object(
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+
+ /* Unpin inode when detaching from connector */
+ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
+ inode = NULL;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
@@ -274,7 +316,8 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
- __fsnotify_recalc_mask(conn);
+ objp = __fsnotify_recalc_mask(conn);
+ type = conn->type;
}
WRITE_ONCE(mark->connector, NULL);
spin_unlock(&conn->lock);
@@ -398,9 +441,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
*/
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
- struct fsnotify_group *group = mark->group;
-
- WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
+ fsnotify_group_assert_locked(mark->group);
WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
refcount_read(&mark->refcnt) < 1 +
!!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
@@ -452,9 +493,9 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
@@ -499,7 +540,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
unsigned int obj_type,
__kernel_fsid_t *fsid)
{
- struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
@@ -507,6 +547,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
return -ENOMEM;
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
+ conn->flags = 0;
conn->type = obj_type;
conn->obj = connp;
/* Cache fsid of filesystem containing the object */
@@ -517,10 +558,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
conn->fsid.val[0] = conn->fsid.val[1] = 0;
conn->flags = 0;
}
- if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
- inode = fsnotify_conn_inode(conn);
- fsnotify_get_inode_ref(inode);
- }
fsnotify_get_sb_connectors(conn);
/*
@@ -529,8 +566,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
- if (inode)
- fsnotify_put_inode_ref(inode);
fsnotify_put_sb_connectors(conn);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
@@ -574,7 +609,7 @@ out:
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp,
unsigned int obj_type,
- int allow_dups, __kernel_fsid_t *fsid)
+ int add_flags, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -633,7 +668,7 @@ restart:
if ((lmark->group == mark->group) &&
(lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
- !allow_dups) {
+ !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
err = -EEXIST;
goto out_err;
}
@@ -668,12 +703,12 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int obj_type,
- int allow_dups, __kernel_fsid_t *fsid)
+ int add_flags, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
+ fsnotify_group_assert_locked(group);
/*
* LOCKING ORDER!!!!
@@ -688,12 +723,11 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid);
+ ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
if (ret)
goto err;
- if (mark->mask || mark->ignored_mask)
- fsnotify_recalc_mask(mark->connector);
+ fsnotify_recalc_mask(mark->connector);
return ret;
err:
@@ -708,15 +742,15 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int obj_type, int allow_dups,
+ unsigned int obj_type, int add_flags,
__kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
- mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_lock(group);
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
+ fsnotify_group_unlock(group);
return ret;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);
@@ -770,24 +804,24 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
* move the marks to be freed to the to_free list in one go and then free
* the marks on the to_free list one by one.
*/
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
if (mark->connector->type == obj_type)
list_move(&mark->g_list, &to_free);
}
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
clear:
while (1) {
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
if (list_empty(head)) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
break;
}
mark = list_first_entry(head, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 90e3dad8ee45..9e3964ea2ea0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -159,7 +159,7 @@ still_busy:
*
* Return 0 on success and -errno on error.
*
- * Contains an adapted version of fs/buffer.c::block_read_full_page().
+ * Contains an adapted version of fs/buffer.c::block_read_full_folio().
*/
static int ntfs_read_block(struct page *page)
{
@@ -358,16 +358,16 @@ handle_zblock:
}
/**
- * ntfs_readpage - fill a @page of a @file with data from the device
- * @file: open file to which the page @page belongs or NULL
- * @page: page cache page to fill with data
+ * ntfs_read_folio - fill a @folio of a @file with data from the device
+ * @file: open file to which the folio @folio belongs or NULL
+ * @folio: page cache folio to fill with data
*
- * For non-resident attributes, ntfs_readpage() fills the @page of the open
- * file @file by calling the ntfs version of the generic block_read_full_page()
+ * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
+ * file @file by calling the ntfs version of the generic block_read_full_folio()
* function, ntfs_read_block(), which in turn creates and reads in the buffers
- * associated with the page asynchronously.
+ * associated with the folio asynchronously.
*
- * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
+ * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
* data from the mft record (which at this stage is most likely in memory) and
* fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
* even if the mft record is not cached at this point in time, we need to wait
@@ -375,8 +375,9 @@ handle_zblock:
*
* Return 0 on success and -errno on error.
*/
-static int ntfs_readpage(struct file *file, struct page *page)
+static int ntfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
loff_t i_size;
struct inode *vi;
ntfs_inode *ni, *base_ni;
@@ -458,7 +459,7 @@ retry_readpage:
}
/*
* If a parallel write made the attribute non-resident, drop the mft
- * record and retry the readpage.
+ * record and retry the read_folio.
*/
if (unlikely(NInoNonResident(ni))) {
unmap_mft_record(base_ni);
@@ -637,10 +638,11 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely((block >= iblock) &&
(initialized_size < i_size))) {
/*
- * If this page is fully outside initialized size, zero
- * out all pages between the current initialized size
- * and the current page. Just use ntfs_readpage() to do
- * the zeroing transparently.
+ * If this page is fully outside initialized
+ * size, zero out all pages between the current
+ * initialized size and the current page. Just
+ * use ntfs_read_folio() to do the zeroing
+ * transparently.
*/
if (block > iblock) {
// TODO:
@@ -798,7 +800,7 @@ lock_retry_remap:
/* For the error case, need to reset bh to the beginning. */
bh = head;
- /* Just an optimization, so ->readpage() is not called later. */
+ /* Just an optimization, so ->read_folio() is not called later. */
if (unlikely(!PageUptodate(page))) {
int uptodate = 1;
do {
@@ -1329,7 +1331,7 @@ done:
* vfs inode dirty code path for the inode the mft record belongs to or via the
* vm page dirty code path for the page the mft record is in.
*
- * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
+ * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
*
* Return 0 on success and -errno on error.
*/
@@ -1651,7 +1653,7 @@ hole:
* attributes.
*/
const struct address_space_operations ntfs_normal_aops = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
.dirty_folio = block_dirty_folio,
@@ -1666,7 +1668,7 @@ const struct address_space_operations ntfs_normal_aops = {
* ntfs_compressed_aops - address space operations for compressed inodes
*/
const struct address_space_operations ntfs_compressed_aops = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
.dirty_folio = block_dirty_folio,
@@ -1681,7 +1683,7 @@ const struct address_space_operations ntfs_compressed_aops = {
* and attributes
*/
const struct address_space_operations ntfs_mst_aops = {
- .readpage = ntfs_readpage, /* Fill page with data. */
+ .read_folio = ntfs_read_folio, /* Fill page with data. */
#ifdef NTFS_RW
.writepage = ntfs_writepage, /* Write dirty page to disk. */
.dirty_folio = filemap_dirty_folio,
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index f0962d46bd67..934d5f79b9e7 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -37,9 +37,9 @@ static inline void ntfs_unmap_page(struct page *page)
* Read a page from the page cache of the address space @mapping at position
* @index, where @index is in units of PAGE_SIZE, and not in bytes.
*
- * If the page is not in memory it is loaded from disk first using the readpage
- * method defined in the address space operations of @mapping and the page is
- * added to the page cache of @mapping in the process.
+ * If the page is not in memory it is loaded from disk first using the
+ * read_folio method defined in the address space operations of @mapping
+ * and the page is added to the page cache of @mapping in the process.
*
* If the page belongs to an mst protected attribute and it is marked as such
* in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 2911c04a33e0..4de597a83b88 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1719,7 +1719,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
vi->i_blocks = ni->allocated_size >> 9;
write_unlock_irqrestore(&ni->size_lock, flags);
/*
- * This needs to be last since the address space operations ->readpage
+ * This needs to be last since the address space operations ->read_folio
* and ->writepage can run concurrently with us as they are not
* serialized on i_mutex. Note, we are not allowed to fail once we flip
* this switch, which is another reason to do this last.
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index d2f9d6a0ee32..a60f543e7557 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -780,12 +780,12 @@ lock_retry_remap:
/* Uncompressed cb, copy it to the destination pages. */
/*
* TODO: As a big optimization, we could detect this case
- * before we read all the pages and use block_read_full_page()
+ * before we read all the pages and use block_read_full_folio()
* on all full pages instead (we still have to treat partial
* pages especially but at least we are getting rid of the
* synchronous I/O for the majority of pages).
* Or if we choose not to do the read-ahead/-behind stuff, we
- * could just return block_read_full_page(pages[xpage]) as long
+ * could just return block_read_full_folio(pages[xpage]) as long
* as PAGE_SIZE <= cb_size.
*/
if (cb_max_ofs)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 329fca1fa619..a8abe2296514 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -251,14 +251,14 @@ do_non_resident_extend:
*
* TODO: For sparse pages could optimize this workload by using
* the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
- * would be set in readpage for sparse pages and here we would
+ * would be set in read_folio for sparse pages and here we would
* not need to mark dirty any pages which have this bit set.
* The only caveat is that we have to clear the bit everywhere
* where we allocate any clusters that lie in the page or that
* contain the page.
*
* TODO: An even greater optimization would be for us to only
- * call readpage() on pages which are not in sparse regions as
+ * call read_folio() on pages which are not in sparse regions as
* determined from the runlist. This would greatly reduce the
* number of pages we read and make dirty in the case of sparse
* files.
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index efe0602b4e51..db0f1995aedd 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1832,7 +1832,7 @@ int ntfs_read_inode_mount(struct inode *vi)
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
- /* Provides readpage() for map_mft_record(). */
+ /* Provides read_folio() for map_mft_record(). */
vi->i_mapping->a_ops = &ntfs_mst_aops;
ctx = ntfs_attr_get_search_ctx(ni, m);
@@ -2503,7 +2503,7 @@ retry_truncate:
* between the old data_size, i.e. old_size, and the new_size
* has not been zeroed. Fortunately, we do not need to zero it
* either since on one hand it will either already be zero due
- * to both readpage and writepage clearing partial page data
+ * to both read_folio and writepage clearing partial page data
* beyond i_size in which case there is nothing to do or in the
* case of the file being mmap()ped at the same time, POSIX
* specifies that the behaviour is unspecified thus we do not
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
index 17bfefc30271..49c001af16ed 100644
--- a/fs/ntfs/mft.h
+++ b/fs/ntfs/mft.h
@@ -79,7 +79,7 @@ extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
* paths and via the page cache write back code paths or between writing
* neighbouring mft records residing in the same page.
*
- * Locking the page also serializes us against ->readpage() if the page is not
+ * Locking the page also serializes us against ->read_folio() if the page is not
* uptodate.
*
* On success, clean the mft record and return 0. On error, leave the mft
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 787b53b984ee..a4fcdc7927ca 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -22,20 +22,20 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
{
struct fstrim_range __user *user_range;
struct fstrim_range range;
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sbi->sb->s_bdev))
return -EOPNOTSUPP;
user_range = (struct fstrim_range __user *)arg;
if (copy_from_user(&range, user_range, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u32, range.minlen, q->limits.discard_granularity);
+ range.minlen = max_t(u32, range.minlen,
+ bdev_discard_granularity(sbi->sb->s_bdev));
err = ntfs_trim_fs(sbi, &range);
if (err < 0)
@@ -115,7 +115,6 @@ static int ntfs_extend_initialized_size(struct file *file,
for (;;) {
u32 zerofrom, len;
struct page *page;
- void *fsdata;
u8 bits;
CLST vcn, lcn, clen;
@@ -157,16 +156,14 @@ static int ntfs_extend_initialized_size(struct file *file,
if (pos + len > new_valid)
len = new_valid - pos;
- err = pagecache_write_begin(file, mapping, pos, len, 0, &page,
- &fsdata);
+ err = ntfs_write_begin(file, mapping, pos, len, &page, NULL);
if (err)
goto out;
zero_user_segment(page, zerofrom, PAGE_SIZE);
/* This function in any case puts page. */
- err = pagecache_write_end(file, mapping, pos, len, len, page,
- fsdata);
+ err = ntfs_write_end(file, mapping, pos, len, len, page, NULL);
if (err < 0)
goto out;
pos += len;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 9eab11e3b034..74f60c457f28 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -676,8 +676,9 @@ static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, ntfs_get_block_bmap);
}
-static int ntfs_readpage(struct file *file, struct page *page)
+static int ntfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
int err;
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
@@ -701,7 +702,7 @@ static int ntfs_readpage(struct file *file, struct page *page)
}
/* Normal + sparse files. */
- return mpage_readpage(page, ntfs_get_block);
+ return mpage_read_folio(folio, ntfs_get_block);
}
static void ntfs_readahead(struct readahead_control *rac)
@@ -861,9 +862,8 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
bh_result, create, GET_BLOCK_WRITE_BEGIN);
}
-static int ntfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, u32 flags, struct page **pagep,
- void **fsdata)
+int ntfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, struct page **pagep, void **fsdata)
{
int err;
struct inode *inode = mapping->host;
@@ -872,7 +872,7 @@ static int ntfs_write_begin(struct file *file, struct address_space *mapping,
*pagep = NULL;
if (is_resident(ni)) {
struct page *page = grab_cache_page_write_begin(
- mapping, pos >> PAGE_SHIFT, flags);
+ mapping, pos >> PAGE_SHIFT);
if (!page) {
err = -ENOMEM;
@@ -894,7 +894,7 @@ static int ntfs_write_begin(struct file *file, struct address_space *mapping,
goto out;
}
- err = block_write_begin(mapping, pos, len, flags, pagep,
+ err = block_write_begin(mapping, pos, len, pagep,
ntfs_get_block_write_begin);
out:
@@ -904,10 +904,9 @@ out:
/*
* ntfs_write_end - Address_space_operations::write_end.
*/
-static int ntfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, u32 copied, struct page *page,
- void *fsdata)
-
+int ntfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct page *page,
+ void *fsdata)
{
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
@@ -975,7 +974,7 @@ int reset_log_file(struct inode *inode)
len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE;
- err = block_write_begin(mapping, pos, len, 0, &page,
+ err = block_write_begin(mapping, pos, len, &page,
ntfs_get_block_write_begin);
if (err)
goto out;
@@ -1942,7 +1941,7 @@ const struct inode_operations ntfs_link_inode_operations = {
};
const struct address_space_operations ntfs_aops = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
.readahead = ntfs_readahead,
.writepage = ntfs_writepage,
.writepages = ntfs_writepages,
@@ -1954,7 +1953,7 @@ const struct address_space_operations ntfs_aops = {
};
const struct address_space_operations ntfs_aops_cmpr = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
.readahead = ntfs_readahead,
};
// clang-format on
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index fb825059d488..8de129a6419b 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -689,6 +689,11 @@ int ntfs_set_size(struct inode *inode, u64 new_size);
int reset_log_file(struct inode *inode);
int ntfs_get_block(struct inode *inode, sector_t vbn,
struct buffer_head *bh_result, int create);
+int ntfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, struct page **pagep, void **fsdata);
+int ntfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct page *page,
+ void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index b2b54c4553f9..0c6de6287737 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -886,7 +886,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
int err;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct block_device *bdev = sb->s_bdev;
- struct request_queue *rq;
struct inode *inode;
struct ntfs_inode *ni;
size_t i, tt;
@@ -916,15 +915,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto out;
}
- rq = bdev_get_queue(bdev);
- if (blk_queue_discard(rq) && rq->limits.discard_granularity) {
- sbi->discard_granularity = rq->limits.discard_granularity;
+ if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) {
+ sbi->discard_granularity = bdev_discard_granularity(bdev);
sbi->discard_granularity_mask_inv =
~(u64)(sbi->discard_granularity - 1);
}
/* Parse boot. */
- err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
+ err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev),
bdev_nr_bytes(bdev));
if (err)
goto out;
@@ -1339,7 +1337,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
return 0;
err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (err == -EOPNOTSUPP)
sbi->flags |= NTFS_FLAGS_NODISCARD;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 49f41074baad..51c93929a146 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7427,7 +7427,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
/*
* No need to worry about the data page here - it's been
* truncated already and inline data doesn't need it for
- * pushing zero's to disk, so we'll let readpage pick it up
+ * pushing zeros to disk, so we'll let read_folio pick it up
* later.
*/
if (trunc) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4b9af65cb61b..35d40a67204c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -275,8 +275,9 @@ out:
return ret;
}
-static int ocfs2_readpage(struct file *file, struct page *page)
+static int ocfs2_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
loff_t start = (loff_t)page->index << PAGE_SHIFT;
@@ -309,7 +310,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
/*
* i_size might have just been updated as we grabbed the meta lock. We
* might now be discovering a truncate that hit on another node.
- * block_read_full_page->get_block freaks out if it is asked to read
+ * block_read_full_folio->get_block freaks out if it is asked to read
* beyond the end of a file, so we check here. Callers
* (generic_file_read, vm_ops->fault) are clever enough to check i_size
* and notice that the page they just read isn't needed.
@@ -326,7 +327,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
ret = ocfs2_readpage_inline(inode, page);
else
- ret = block_read_full_page(page, ocfs2_get_block);
+ ret = block_read_full_folio(page_folio(page), ocfs2_get_block);
unlock = 0;
out_alloc:
@@ -497,11 +498,11 @@ bail:
return status;
}
-static int ocfs2_releasepage(struct page *page, gfp_t wait)
+static bool ocfs2_release_folio(struct folio *folio, gfp_t wait)
{
- if (!page_has_buffers(page))
- return 0;
- return try_to_free_buffers(page);
+ if (!folio_buffers(folio))
+ return false;
+ return try_to_free_buffers(folio);
}
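The releasepage() to release_folio() conversion changes the hook's return type from int to bool and passes a folio; try_to_free_buffers() now takes a folio as well. A minimal sketch mirroring the converted ocfs2 helper above, for a hypothetical buffer-head filesystem:

	/* Sketch: generic release_folio for a buffer-head based fs. */
	static bool foo_release_folio(struct folio *folio, gfp_t gfp)
	{
		if (!folio_buffers(folio))
			return false;
		return try_to_free_buffers(folio);
	}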
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -1881,7 +1882,7 @@ out:
}
static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
@@ -1897,7 +1898,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
/*
* Take alloc sem here to prevent concurrent lookups. That way
* the mapping, zeroing and tree manipulation within
- * ocfs2_write() will be safe against ->readpage(). This
+ * ocfs2_write() will be safe against ->read_folio(). This
* should also serve to lock out allocation from a shared
* writeable region.
*/
@@ -2454,7 +2455,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations ocfs2_aops = {
.dirty_folio = block_dirty_folio,
- .readpage = ocfs2_readpage,
+ .read_folio = ocfs2_read_folio,
.readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
.write_begin = ocfs2_write_begin,
@@ -2462,7 +2463,7 @@ const struct address_space_operations ocfs2_aops = {
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
.invalidate_folio = block_invalidate_folio,
- .releasepage = ocfs2_releasepage,
+ .release_folio = ocfs2_release_folio,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 01b7407a8893..7497cd592258 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2526,7 +2526,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
return -EOPNOTSUPP;
/*
- * buffered reads protect themselves in ->readpage(). O_DIRECT reads
+ * buffered reads protect themselves in ->read_folio(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
if (direct_io) {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f59461d85da4..afd54ec66103 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -903,20 +903,19 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev),
range.minlen);
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7f6355cbb587..e04358a46b68 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2961,12 +2961,14 @@ retry:
}
if (!PageUptodate(page)) {
- ret = block_read_full_page(page, ocfs2_get_block);
+ struct folio *folio = page_folio(page);
+
+ ret = block_read_full_folio(folio, ocfs2_get_block);
if (ret) {
mlog_errno(ret);
goto unlock;
}
- lock_page(page);
+ folio_lock(folio);
}
if (page_has_buffers(page)) {
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f755a4985821..d4c5fdcfa1e4 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -52,8 +52,9 @@
#include "buffer_head_io.h"
-static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
+static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct buffer_head *bh = NULL;
int status = ocfs2_read_inode_block(inode, &bh);
@@ -81,7 +82,7 @@ static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
}
const struct address_space_operations ocfs2_fast_symlink_aops = {
- .readpage = ocfs2_fast_symlink_readpage,
+ .read_folio = ocfs2_fast_symlink_read_folio,
};
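Several conversions in this series, including the one above, use a transitional shim: the aops entry now takes a folio, but the body still operates on the head page until the filesystem is converted in full. A minimal sketch of the shape, with myfs_do_readpage() as a hypothetical stand-in for the existing page-based body:

	static int myfs_read_folio(struct file *file, struct folio *folio)
	{
		/* No large folios here yet, so the head page is the page. */
		struct page *page = &folio->page;

		return myfs_do_readpage(file, page);
	}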
const struct inode_operations ocfs2_symlink_inode_operations = {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 3f297b541713..fa7fe2393ff6 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -284,9 +284,9 @@ out:
return ret;
}
-static int omfs_readpage(struct file *file, struct page *page)
+static int omfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, omfs_get_block);
+ return block_read_full_folio(folio, omfs_get_block);
}
static void omfs_readahead(struct readahead_control *rac)
@@ -316,13 +316,12 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
}
static int omfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- omfs_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, omfs_get_block);
if (unlikely(ret))
omfs_write_failed(mapping, pos + len);
@@ -374,7 +373,7 @@ const struct inode_operations omfs_file_inops = {
const struct address_space_operations omfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = omfs_readpage,
+ .read_folio = omfs_read_folio,
.readahead = omfs_readahead,
.writepage = omfs_writepage,
.writepages = omfs_writepages,
diff --git a/fs/open.c b/fs/open.c
index 1315253e0247..be849dcca032 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -834,16 +834,15 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
+ if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
+ f->f_mode |= FMODE_CAN_ODIRECT;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
- /* NB: we're sure to have correct a_ops only after f_op->open */
- if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
/*
* XXX: Huge page cache doesn't support writing yet. Drop all page
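With FMODE_CAN_ODIRECT, O_DIRECT support becomes a mode bit set at open time rather than an a_ops probe. A filesystem that supports O_DIRECT without a ->direct_IO method could, in principle, opt in from its ->open() — a hypothetical myfs example, not taken from this patch:

	static int myfs_open(struct inode *inode, struct file *file)
	{
		/* Claim O_DIRECT support despite having no ->direct_IO. */
		file->f_mode |= FMODE_CAN_ODIRECT;
		return generic_file_open(inode, file);
	}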
@@ -981,6 +980,48 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
+/**
+ * dentry_create - Create and open a file
+ * @path: path to create
+ * @flags: O_ flags
+ * @mode: mode bits for new file
+ * @cred: credentials to use
+ *
+ * Caller must hold the parent directory's lock, and have prepared
+ * a negative dentry, placed in @path->dentry, for the new file.
+ *
+ * Caller sets @path->mnt to the vfsmount of the filesystem where
+ * the new file is to be created. The parent directory and the
+ * negative dentry must reside on the same filesystem instance.
+ *
+ * On success, returns a "struct file *". Otherwise an ERR_PTR
+ * is returned.
+ */
+struct file *dentry_create(const struct path *path, int flags, umode_t mode,
+ const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ validate_creds(cred);
+ f = alloc_empty_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ error = vfs_create(mnt_user_ns(path->mnt),
+ d_inode(path->dentry->d_parent),
+ path->dentry, mode, true);
+ if (!error)
+ error = vfs_open(path, f);
+
+ if (unlikely(error)) {
+ fput(f);
+ return ERR_PTR(error);
+ }
+ return f;
+}
+EXPORT_SYMBOL(dentry_create);
+
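A sketch of the calling convention the kernel-doc above describes — parent locked, negative dentry prepared, @path->mnt filled in. The surrounding helper and its error handling are illustrative, not from this patch:

	static struct file *example_create(const struct path *dir, const char *name,
					   const struct cred *cred)
	{
		struct path path = { .mnt = dir->mnt };
		struct file *file;

		inode_lock_nested(d_inode(dir->dentry), I_MUTEX_PARENT);
		path.dentry = lookup_one_len(name, dir->dentry, strlen(name));
		if (IS_ERR(path.dentry)) {
			file = ERR_CAST(path.dentry);
			goto out;
		}
		file = dentry_create(&path, O_CREAT | O_WRONLY, 0600, cred);
		dput(path.dentry);
	out:
		inode_unlock(d_inode(dir->dentry));
		return file;
	}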
struct file *open_with_fake_path(const struct path *path, int flags,
struct inode *inode, const struct cred *cred)
{
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 79c1025d18ea..5ce27dde3c79 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -288,47 +288,45 @@ static void orangefs_readahead(struct readahead_control *rac)
}
}
-static int orangefs_readpage(struct file *file, struct page *page)
+static int orangefs_read_folio(struct file *file, struct folio *folio)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct iov_iter iter;
struct bio_vec bv;
ssize_t ret;
- loff_t off; /* offset into this page */
+ loff_t off; /* offset of this folio in the file */
if (folio_test_dirty(folio))
orangefs_launder_folio(folio);
- off = page_offset(page);
- bv.bv_page = page;
- bv.bv_len = PAGE_SIZE;
+ off = folio_pos(folio);
+ bv.bv_page = &folio->page;
+ bv.bv_len = folio_size(folio);
bv.bv_offset = 0;
- iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
+ iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio));
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
- PAGE_SIZE, inode->i_size, NULL, NULL, file);
+ folio_size(folio), inode->i_size, NULL, NULL, file);
/* this will only zero remaining unread portions of the page data */
iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
if (ret < 0) {
- SetPageError(page);
+ folio_set_error(folio);
} else {
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
+ folio_mark_uptodate(folio);
+ if (folio_test_error(folio))
+ folio_clear_error(folio);
ret = 0;
}
- /* unlock the page after the ->readpage() routine completes */
- unlock_page(page);
+ /* unlock the folio after the ->read_folio() routine completes */
+ folio_unlock(folio);
return ret;
}
static int orangefs_write_begin(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags, struct page **pagep,
- void **fsdata)
+ struct address_space *mapping, loff_t pos, unsigned len,
+ struct page **pagep, void **fsdata)
{
struct orangefs_write_range *wr;
struct folio *folio;
@@ -338,7 +336,7 @@ static int orangefs_write_begin(struct file *file,
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -487,14 +485,14 @@ static void orangefs_invalidate_folio(struct folio *folio,
orangefs_launder_folio(folio);
}
-static int orangefs_releasepage(struct page *page, gfp_t foo)
+static bool orangefs_release_folio(struct folio *folio, gfp_t foo)
{
- return !PagePrivate(page);
+ return !folio_test_private(folio);
}
-static void orangefs_freepage(struct page *page)
+static void orangefs_free_folio(struct folio *folio)
{
- kfree(detach_page_private(page));
+ kfree(folio_detach_private(folio));
}
static int orangefs_launder_folio(struct folio *folio)
@@ -632,14 +630,14 @@ out:
static const struct address_space_operations orangefs_address_operations = {
.writepage = orangefs_writepage,
.readahead = orangefs_readahead,
- .readpage = orangefs_readpage,
+ .read_folio = orangefs_read_folio,
.writepages = orangefs_writepages,
.dirty_folio = filemap_dirty_folio,
.write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
.invalidate_folio = orangefs_invalidate_folio,
- .releasepage = orangefs_releasepage,
- .freepage = orangefs_freepage,
+ .release_folio = orangefs_release_folio,
+ .free_folio = orangefs_free_folio,
.launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index fa125feed0ff..9d69b4dbb8c4 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -82,11 +82,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
return -EPERM;
- if (flags & O_DIRECT) {
- if (!file->f_mapping->a_ops ||
- !file->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (file->f_op->check_flags) {
err = file->f_op->check_flags(flags);
@@ -306,8 +303,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
@@ -367,8 +363,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
diff --git a/fs/pipe.c b/fs/pipe.c
index d04c3fce28a6..74ae9fafd25a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1245,30 +1245,33 @@ unsigned int round_pipe_size(unsigned long size)
/*
* Resize the pipe ring to a number of slots.
+ *
+ * Note the pipe can be reduced in capacity, but only if the current
+ * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
+ * returned instead.
*/
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;
unsigned int head, tail, mask, n;
- /*
- * We can shrink the pipe, if arg is greater than the ring occupancy.
- * Since we don't expect a lot of shrink+grow operations, just free and
- * allocate again like we would do for growing. If the pipe currently
- * contains more buffers than arg, then return busy.
- */
- mask = pipe->ring_size - 1;
- head = pipe->head;
- tail = pipe->tail;
- n = pipe_occupancy(pipe->head, pipe->tail);
- if (nr_slots < n)
- return -EBUSY;
-
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
+ spin_lock_irq(&pipe->rd_wait.lock);
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+
+ n = pipe_occupancy(head, tail);
+ if (nr_slots < n) {
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ kfree(bufs);
+ return -EBUSY;
+ }
+
/*
* The pipe array wraps around, so just start the new one at zero
* and adjust the indices.
@@ -1300,6 +1303,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
pipe->tail = tail;
pipe->head = head;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+
/* This might have made more room for writers */
wake_up_interruptible(&pipe->wr_wait);
return 0;
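From userspace this path is reached via fcntl(F_SETPIPE_SZ); the -EBUSY case above is observable when shrinking a pipe that still holds more data than the requested capacity. An illustrative program:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];
		char buf[8192] = { 0 };

		if (pipe(fds) < 0)
			return 1;
		write(fds[1], buf, sizeof(buf));	/* two pages queued */
		if (fcntl(fds[1], F_SETPIPE_SZ, 4096) < 0)
			perror("shrink below occupancy");	/* expect EBUSY */
		return 0;
	}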
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c1031843cc6a..8dfa36a99c74 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3154,6 +3154,22 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_LIVEPATCH */
+#ifdef CONFIG_KSM
+static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "%lu\n", mm->ksm_merging_pages);
+ mmput(mm);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KSM */
+
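The new file exposes mm->ksm_merging_pages as a single counter; reading it from userspace is straightforward (illustrative program):

	#include <stdio.h>

	int main(void)
	{
		unsigned long pages;
		FILE *f = fopen("/proc/self/ksm_merging_pages", "r");

		if (f && fscanf(f, "%lu", &pages) == 1)
			printf("KSM merged pages: %lu\n", pages);
		if (f)
			fclose(f);
		return 0;
	}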
#ifdef CONFIG_STACKLEAK_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -3285,6 +3301,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3618,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 419760fd77bd..f38bda5b83ec 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b31..913bef0d2a36 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6fa761c9cc78..6e89f0e2fd20 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
+#ifdef CONFIG_ZSWAP
+ seq_printf(m, "Zswap: %8lu kB\n",
+ (unsigned long)(zswap_pool_total_size >> 10));
+ seq_printf(m, "Zswapped: %8lu kB\n",
+ (unsigned long)atomic_read(&zswap_stored_pages) <<
+ (PAGE_SHIFT - 10));
+#endif
show_val_kb(m, "Dirty: ",
global_node_page_state(NR_FILE_DIRTY));
show_val_kb(m, "Writeback: ",
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 7d9cfc730bd4..021e83fe831f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -19,6 +19,9 @@
#include <linux/kmemleak.h>
#include "internal.h"
+#define list_for_each_table_entry(entry, table) \
+ for ((entry) = (table); (entry)->procname; (entry)++)
+
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
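The new helper assumes the usual sentinel-terminated ctl_table layout — iteration stops at the first entry with a NULL ->procname. For illustration (my_knob is hypothetical):

	static int my_knob;

	static struct ctl_table my_table[] = {
		{
			.procname	= "my_knob",
			.data		= &my_knob,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }	/* NULL ->procname ends list_for_each_table_entry() */
	};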
@@ -26,7 +29,7 @@ static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 };
+const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
EXPORT_SYMBOL(sysctl_vals);
const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
@@ -215,15 +218,19 @@ static void init_header(struct ctl_table_header *head,
INIT_HLIST_HEAD(&head->inodes);
if (node) {
struct ctl_table *entry;
- for (entry = table; entry->procname; entry++, node++)
+
+ list_for_each_table_entry(entry, table) {
node->header = head;
+ node++;
+ }
}
}
static void erase_header(struct ctl_table_header *head)
{
struct ctl_table *entry;
- for (entry = head->ctl_table; entry->procname; entry++)
+
+ list_for_each_table_entry(entry, head->ctl_table)
erase_entry(head, entry);
}
@@ -248,7 +255,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
err = insert_links(header);
if (err)
goto fail_links;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -978,7 +985,6 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
table = (struct ctl_table *)(node + 1);
new_name = (char *)(table + 2);
memcpy(new_name, name, namelen);
- new_name[namelen] = '\0';
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
init_header(&new->header, set->dir.header.root, set, node, table);
@@ -1130,35 +1136,36 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
static int sysctl_check_table(const char *path, struct ctl_table *table)
{
+ struct ctl_table *entry;
int err = 0;
- for (; table->procname; table++) {
- if (table->child)
- err |= sysctl_err(path, table, "Not a file");
-
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_douintvec) ||
- (table->proc_handler == proc_douintvec_minmax) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dou8vec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- err |= sysctl_err(path, table, "No data");
- if (!table->maxlen)
- err |= sysctl_err(path, table, "No maxlen");
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ err |= sysctl_err(path, entry, "Not a file");
+
+ if ((entry->proc_handler == proc_dostring) ||
+ (entry->proc_handler == proc_dointvec) ||
+ (entry->proc_handler == proc_douintvec) ||
+ (entry->proc_handler == proc_douintvec_minmax) ||
+ (entry->proc_handler == proc_dointvec_minmax) ||
+ (entry->proc_handler == proc_dou8vec_minmax) ||
+ (entry->proc_handler == proc_dointvec_jiffies) ||
+ (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (entry->proc_handler == proc_dointvec_ms_jiffies) ||
+ (entry->proc_handler == proc_doulongvec_minmax) ||
+ (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!entry->data)
+ err |= sysctl_err(path, entry, "No data");
+ if (!entry->maxlen)
+ err |= sysctl_err(path, entry, "No maxlen");
else
- err |= sysctl_check_table_array(path, table);
+ err |= sysctl_check_table_array(path, entry);
}
- if (!table->proc_handler)
- err |= sysctl_err(path, table, "No proc_handler");
+ if (!entry->proc_handler)
+ err |= sysctl_err(path, entry, "No proc_handler");
- if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
- err |= sysctl_err(path, table, "bogus .mode 0%o",
- table->mode);
+ if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
+ err |= sysctl_err(path, entry, "bogus .mode 0%o",
+ entry->mode);
}
return err;
}
@@ -1174,7 +1181,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
name_bytes = 0;
nr_entries = 0;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
@@ -1191,14 +1198,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
node = (struct ctl_node *)(links + 1);
link_table = (struct ctl_table *)(node + nr_entries);
link_name = (char *)&link_table[nr_entries + 1];
+ link = link_table;
- for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ list_for_each_table_entry(entry, table) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
link->data = link_root;
link_name += len;
+ link++;
}
init_header(links, dir->header.root, dir->header.set, node, link_table);
links->nreg = nr_entries;
@@ -1213,7 +1222,7 @@ static bool get_links(struct ctl_dir *dir,
struct ctl_table *entry, *link;
/* Are there links available for every entry in table? */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
if (!link)
@@ -1226,7 +1235,7 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
head->nreg++;
@@ -1329,11 +1338,11 @@ struct ctl_table_header *__register_sysctl_table(
struct ctl_node *node;
int nr_entries = 0;
- for (entry = table; entry->procname; entry++)
+ list_for_each_table_entry(entry, table)
nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
@@ -1456,7 +1465,7 @@ static int count_subheaders(struct ctl_table *table)
if (!table || !table->procname)
return 1;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_subheaders += count_subheaders(entry->child);
else
@@ -1475,7 +1484,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
int nr_dirs = 0;
int err = -ENOMEM;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_dirs++;
else
@@ -1492,7 +1501,9 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
goto out;
ctl_table_arg = files;
- for (new = files, entry = table; entry->procname; entry++) {
+ new = files;
+
+ list_for_each_table_entry(entry, table) {
if (entry->child)
continue;
*new = *entry;
@@ -1516,7 +1527,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
}
/* Recurse into the subdirectories. */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
char *child_pos;
if (!entry->child)
@@ -1670,7 +1681,7 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
struct ctl_table_header *link_head;
struct ctl_table *link;
const char *name = entry->procname;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..2d04e3470d4c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1421,6 +1421,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
+ if (pte_marker_entry_uffd_wp(entry))
+ flags |= PM_UFFD_WP;
}
if (page && !PageAnon(page))
@@ -1556,10 +1558,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
if (page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
+ if (huge_pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pte_pfn(pte) +
((addr & ~hmask) >> PAGE_SHIFT);
+ } else if (pte_swp_uffd_wp_any(pte)) {
+ flags |= PM_UFFD_WP;
}
for (; addr != end; addr += PAGE_SIZE) {
@@ -1873,8 +1880,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
return 0;
page = pte_page(huge_pte);
- if (!page)
- return 0;
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index a635bb6615e9..391ea402920d 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -245,17 +245,18 @@ static void qnx4_kill_sb(struct super_block *sb)
}
}
-static int qnx4_readpage(struct file *file, struct page *page)
+static int qnx4_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,qnx4_get_block);
+ return block_read_full_folio(folio, qnx4_get_block);
}
static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,qnx4_get_block);
}
+
static const struct address_space_operations qnx4_aops = {
- .readpage = qnx4_readpage,
+ .read_folio = qnx4_read_folio,
.bmap = qnx4_bmap
};
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 9d8e7e9788a1..b9895afca9d1 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -94,9 +94,9 @@ static int qnx6_check_blockptr(__fs32 ptr)
return 1;
}
-static int qnx6_readpage(struct file *file, struct page *page)
+static int qnx6_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, qnx6_get_block);
+ return mpage_read_folio(folio, qnx6_get_block);
}
static void qnx6_readahead(struct readahead_control *rac)
@@ -496,7 +496,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, qnx6_get_block);
}
static const struct address_space_operations qnx6_aops = {
- .readpage = qnx6_readpage,
+ .read_folio = qnx6_read_folio,
.readahead = qnx6_readahead,
.bmap = qnx6_bmap
};
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 203a47232707..6e228bfbe7ef 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -227,7 +227,7 @@ drop_write_lock:
}
/*
* If this is a partial write which happened to make all buffers
- * uptodate then we can optimize away a bogus readpage() for
+ * uptodate then we can optimize away a bogus read_folio() for
* the next read(). Here we 'discover' whether the page went
* uptodate as a result of this (potentially partial) write.
*/
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 36c59b25486c..0cffe054b78e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -167,10 +167,10 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
* cutting the code is fine, since it really isn't in use yet and is easy
* to add back in. But, Vladimir has a really good idea here. Think
* about what happens for reading a file. For each page,
- * The VFS layer calls reiserfs_readpage, who searches the tree to find
+ * The VFS layer calls reiserfs_read_folio, which searches the tree to find
* an indirect item. This indirect item has X number of pointers, where
* X is a big number if we've done the block allocation right. But,
- * we only use one or two of these pointers during each call to readpage,
+ * we only use one or two of these pointers during each call to read_folio,
 * needlessly re-searching again later on.
*
* The size of the cache could be dynamic based on the size of the file.
@@ -966,7 +966,7 @@ research:
* it is important the set_buffer_uptodate is done
* after the direct2indirect. The buffer might
* contain valid data newer than the data on disk
- * (read by readpage, changed, and then sent here by
+ * (read by read_folio, changed, and then sent here by
* writepage). direct2indirect needs to know if unbh
* was already up to date, so it can decide if the
* data in unbh needs to be replaced with data from
@@ -2733,9 +2733,9 @@ fail:
goto done;
}
-static int reiserfs_readpage(struct file *f, struct page *page)
+static int reiserfs_read_folio(struct file *f, struct folio *folio)
{
- return block_read_full_page(page, reiserfs_get_block);
+ return block_read_full_folio(folio, reiserfs_get_block);
}
static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -2753,7 +2753,7 @@ static void reiserfs_truncate_failed_write(struct inode *inode)
static int reiserfs_write_begin(struct file *file,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode;
@@ -2764,7 +2764,7 @@ static int reiserfs_write_begin(struct file *file,
inode = mapping->host;
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -3202,39 +3202,39 @@ static bool reiserfs_dirty_folio(struct address_space *mapping,
}
/*
- * Returns 1 if the page's buffers were dropped. The page is locked.
+ * Returns true if the folio's buffers were dropped. The folio is locked.
*
* Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
- * in the buffers at page_buffers(page).
+ * in the buffers at folio_buffers(folio).
*
* even in -o notail mode, we can't be sure an old mount without -o notail
* didn't create files with tails.
*/
-static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
struct buffer_head *head;
struct buffer_head *bh;
- int ret = 1;
+ bool ret = true;
- WARN_ON(PageChecked(page));
+ WARN_ON(folio_test_checked(folio));
spin_lock(&j->j_dirty_buffers_lock);
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
do {
if (bh->b_private) {
if (!buffer_dirty(bh) && !buffer_locked(bh)) {
reiserfs_free_jh(bh);
} else {
- ret = 0;
+ ret = false;
break;
}
}
bh = bh->b_this_page;
} while (bh != head);
if (ret)
- ret = try_to_free_buffers(page);
+ ret = try_to_free_buffers(folio);
spin_unlock(&j->j_dirty_buffers_lock);
return ret;
}
@@ -3421,9 +3421,9 @@ out:
const struct address_space_operations reiserfs_address_space_operations = {
.writepage = reiserfs_writepage,
- .readpage = reiserfs_readpage,
+ .read_folio = reiserfs_read_folio,
.readahead = reiserfs_readahead,
- .releasepage = reiserfs_releasepage,
+ .release_folio = reiserfs_release_folio,
.invalidate_folio = reiserfs_invalidate_folio,
.write_begin = reiserfs_write_begin,
.write_end = reiserfs_write_end,
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b5b6f6201bed..d8cc9a366124 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -601,14 +601,14 @@ static int journal_list_still_alive(struct super_block *s,
*/
static void release_buffer_page(struct buffer_head *bh)
{
- struct page *page = bh->b_page;
- if (!page->mapping && trylock_page(page)) {
- get_page(page);
+ struct folio *folio = page_folio(bh->b_page);
+ if (!folio->mapping && folio_trylock(folio)) {
+ folio_get(folio);
put_bh(bh);
- if (!page->mapping)
- try_to_free_buffers(page);
- unlock_page(page);
- put_page(page);
+ if (!folio->mapping)
+ try_to_free_buffers(folio);
+ folio_unlock(folio);
+ folio_put(folio);
} else {
put_bh(bh);
}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 9e6bbb4219de..c59b230d55b4 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -18,7 +18,7 @@
* Changed for 2.1.19 modules
* Jan 1997 Initial release
* Jun 1997 2.1.43+ changes
- * Proper page locking in readpage
+ * Proper page locking in read_folio
* Changed to work with 2.1.45+ fs
* Jul 1997 Fixed follow_link
* 2.1.47
@@ -41,7 +41,7 @@
* dentries in lookup
* clean up page flags setting
* (error, uptodate, locking) in
- * in readpage
+ * in read_folio
* use init_special_inode for
* fifos/sockets (and streamline) in
* read_inode, fix _ops table order
@@ -99,8 +99,9 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
/*
* read a page worth of data from the image
*/
-static int romfs_readpage(struct file *file, struct page *page)
+static int romfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
loff_t offset, size;
unsigned long fillsize, pos;
@@ -142,7 +143,7 @@ static int romfs_readpage(struct file *file, struct page *page)
}
static const struct address_space_operations romfs_aops = {
- .readpage = romfs_readpage
+ .read_folio = romfs_read_folio
};
/*
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7ab8a58c29b6..9456a2032224 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -931,6 +931,38 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
}
EXPORT_SYMBOL(seq_list_next);
+struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos)
+{
+ struct list_head *lh;
+
+ list_for_each_rcu(lh, head)
+ if (pos-- == 0)
+ return lh;
+
+ return NULL;
+}
+EXPORT_SYMBOL(seq_list_start_rcu);
+
+struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos)
+{
+ if (!pos)
+ return head;
+
+ return seq_list_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_list_start_head_rcu);
+
+struct list_head *seq_list_next_rcu(void *v, struct list_head *head,
+ loff_t *ppos)
+{
+ struct list_head *lh;
+
+ lh = list_next_rcu((struct list_head *)v);
+ ++*ppos;
+ return lh == head ? NULL : lh;
+}
+EXPORT_SYMBOL(seq_list_next_rcu);
+
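These helpers pair with an RCU read-side section held across the whole iteration; a hypothetical seq_file user (my_list is an assumption, not part of this patch):

	static void *my_seq_start(struct seq_file *m, loff_t *pos)
		__acquires(RCU)
	{
		rcu_read_lock();
		return seq_list_start_rcu(&my_list, *pos);
	}

	static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
	{
		return seq_list_next_rcu(v, &my_list, pos);
	}

	static void my_seq_stop(struct seq_file *m, void *v)
		__releases(RCU)
	{
		rcu_read_unlock();
	}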
/**
* seq_hlist_start - start an iteration of a hlist
* @head: the head of the hlist
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 622c844f6d11..8879d052f96c 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -86,17 +86,10 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- if (page_count <= BIO_MAX_VECS) {
- bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO);
- } else {
- bio = bio_kmalloc(GFP_NOIO, page_count);
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_opf = REQ_OP_READ;
- }
-
+ bio = bio_kmalloc(page_count, GFP_NOIO);
if (!bio)
return -ENOMEM;
-
+ bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ);
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
@@ -126,7 +119,8 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
out_free_bio:
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
return error;
}
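The conversion reflects the reworked bio_kmalloc() API: it now only allocates the bio plus inline vecs, the caller initialises it with bio_init(), and teardown is bio_uninit() + kfree() rather than bio_put(). The pairing, sketched (only the allocation failure is handled):

	struct bio *bio;

	bio = bio_kmalloc(nr_pages, GFP_NOIO);
	if (!bio)
		return -ENOMEM;
	bio_init(bio, bdev, bio->bi_inline_vecs, nr_pages, REQ_OP_READ);

	/* ... add pages, submit, wait for completion ... */

	bio_uninit(bio);
	kfree(bio);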
@@ -190,7 +184,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
length |= data[0] << 8;
}
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
compressed = SQUASHFS_COMPRESSED(length);
length = SQUASHFS_COMPRESSED_SIZE(length);
@@ -224,7 +219,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
out_free_bio:
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
out:
if (res < 0) {
ERROR("Failed to read block 0x%llx: %d\n", index, res);
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 89d492916dea..a8e495d8eb86 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -444,8 +444,9 @@ static int squashfs_readpage_sparse(struct page *page, int expected)
return 0;
}
-static int squashfs_readpage(struct file *file, struct page *page)
+static int squashfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
int index = page->index >> (msblk->block_log - PAGE_SHIFT);
@@ -496,5 +497,5 @@ out:
const struct address_space_operations squashfs_aops = {
- .readpage = squashfs_readpage
+ .read_folio = squashfs_read_folio
};
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 4f74abbc1a54..6d594ba2ed28 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -148,7 +148,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/*
* squashfs provides 'backing_dev_info' in order to disable read-ahead. For
- * squashfs, I/O is not deferred, it is done immediately in readpage,
+ * squashfs, I/O is not deferred, it is done immediately in read_folio,
 * which means the user would always have to wait for their own I/O. So the effect
* of readahead is very weak for squashfs. squashfs_bdi_init will set
* sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 1430613183e6..2bf977a52c2c 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -30,8 +30,9 @@
#include "squashfs.h"
#include "xattr.h"
-static int squashfs_symlink_readpage(struct file *file, struct page *page)
+static int squashfs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct squashfs_sb_info *msblk = sb->s_fs_info;
@@ -101,7 +102,7 @@ error_out:
const struct address_space_operations squashfs_symlink_aops = {
- .readpage = squashfs_symlink_readpage
+ .read_folio = squashfs_symlink_read_folio
};
const struct inode_operations squashfs_symlink_inode_ops = {
diff --git a/fs/super.c b/fs/super.c
index f1d4a193602d..60f57c7bc0a6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1204,7 +1204,7 @@ static int set_bdev_super(struct super_block *s, void *data)
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
- if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
+ if (bdev_stable_writes(s->s_bdev))
s->s_iflags |= SB_I_STABLE_WRITES;
return 0;
}
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 409ab5e17803..d4ec9bb97de9 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -456,9 +456,9 @@ static int sysv_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page,get_block,wbc);
}
-static int sysv_readpage(struct file *file, struct page *page)
+static int sysv_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,get_block);
+ return block_read_full_folio(folio, get_block);
}
int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
@@ -477,12 +477,12 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
}
static int sysv_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, get_block);
if (unlikely(ret))
sysv_write_failed(mapping, pos + len);
@@ -497,7 +497,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations sysv_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = sysv_readpage,
+ .read_folio = sysv_read_folio,
.writepage = sysv_writepage,
.write_begin = sysv_write_begin,
.write_end = generic_write_end,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0383fbdc95ff..04ced154960f 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -31,9 +31,9 @@
* in the "sys_write -> alloc_pages -> direct reclaim path". So, in
* 'ubifs_writepage()' we are only guaranteed that the page is locked.
*
- * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
+ * Similarly, @i_mutex is not always locked in 'ubifs_read_folio()', e.g., the
* read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
- * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not
+ * ondemand_readahead -> read_folio"). In case of readahead, @I_SYNC flag is not
 * set either. However, UBIFS disables readahead.
*/
@@ -215,8 +215,7 @@ static void release_existing_page_budget(struct ubifs_info *c)
}
static int write_begin_slow(struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep,
- unsigned flags)
+ loff_t pos, unsigned len, struct page **pagep)
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -244,7 +243,7 @@ static int write_begin_slow(struct address_space *mapping,
if (unlikely(err))
return err;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (unlikely(!page)) {
ubifs_release_budget(c, &req);
return -ENOMEM;
@@ -419,7 +418,7 @@ static int allocate_budget(struct ubifs_info *c, struct page *page,
* without forcing write-back. The slow path does not make this assumption.
*/
static int ubifs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -437,7 +436,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
return -EROFS;
/* Try out the fast-path part first */
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (unlikely(!page))
return -ENOMEM;
@@ -493,7 +492,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
- return write_begin_slow(mapping, pos, len, pagep, flags);
+ return write_begin_slow(mapping, pos, len, pagep);
}
/*
@@ -890,12 +889,14 @@ out_unlock:
return err;
}
-static int ubifs_readpage(struct file *file, struct page *page)
+static int ubifs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
+
if (ubifs_bulk_read(page))
return 0;
do_readpage(page);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
}
@@ -1483,22 +1484,22 @@ static int ubifs_migrate_page(struct address_space *mapping,
}
#endif
-static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
/*
* An attempt to release a dirty page without budgeting for it - should
* not happen.
*/
- if (PageWriteback(page))
- return 0;
- ubifs_assert(c, PagePrivate(page));
+ if (folio_test_writeback(folio))
+ return false;
+ ubifs_assert(c, folio_test_private(folio));
ubifs_assert(c, 0);
- detach_page_private(page);
- ClearPageChecked(page);
- return 1;
+ folio_detach_private(folio);
+ folio_clear_checked(folio);
+ return true;
}
/*
@@ -1642,7 +1643,7 @@ static int ubifs_symlink_getattr(struct user_namespace *mnt_userns,
}
const struct address_space_operations ubifs_file_address_operations = {
- .readpage = ubifs_readpage,
+ .read_folio = ubifs_read_folio,
.writepage = ubifs_writepage,
.write_begin = ubifs_write_begin,
.write_end = ubifs_write_end,
@@ -1651,7 +1652,7 @@ const struct address_space_operations ubifs_file_address_operations = {
#ifdef CONFIG_MIGRATION
.migratepage = ubifs_migrate_page,
#endif
- .releasepage = ubifs_releasepage,
+ .release_folio = ubifs_release_folio,
};
const struct inode_operations ubifs_file_inode_operations = {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index bad67455215f..0978d01b0ea4 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2191,7 +2191,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
- * UBIFS, I/O is not deferred, it is done immediately in readpage,
+ * UBIFS, I/O is not deferred, it is done immediately in read_folio,
* which means the user would have to wait not just for their own I/O
* but the read-ahead I/O as well i.e. completely pointless.
*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 008fa46ef61e..7d6d2f152e03 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -132,7 +132,7 @@
#define WORST_COMPR_FACTOR 2
#ifdef CONFIG_FS_ENCRYPTION
-#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
+#define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT
#else
#define UBIFS_CIPHER_BLOCK_SIZE 0
#endif
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0f6bf2504437..09aef77269fe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -57,11 +57,11 @@ static void __udf_adinicb_readpage(struct page *page)
kunmap_atomic(kaddr);
}
-static int udf_adinicb_readpage(struct file *file, struct page *page)
+static int udf_adinicb_read_folio(struct file *file, struct folio *folio)
{
- BUG_ON(!PageLocked(page));
- __udf_adinicb_readpage(page);
- unlock_page(page);
+ BUG_ON(!folio_test_locked(folio));
+ __udf_adinicb_readpage(&folio->page);
+ folio_unlock(folio);
return 0;
}
@@ -87,14 +87,14 @@ static int udf_adinicb_writepage(struct page *page,
static int udf_adinicb_write_begin(struct file *file,
struct address_space *mapping, loff_t pos,
- unsigned len, unsigned flags, struct page **pagep,
+ unsigned len, struct page **pagep,
void **fsdata)
{
struct page *page;
if (WARN_ON_ONCE(pos >= PAGE_SIZE))
return -EIO;
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ page = grab_cache_page_write_begin(mapping, 0);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -127,7 +127,7 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin
const struct address_space_operations udf_adinicb_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = udf_adinicb_readpage,
+ .read_folio = udf_adinicb_read_folio,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
.write_end = udf_adinicb_write_end,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ca4fa710e562..edc88716751a 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -193,9 +193,9 @@ static int udf_writepages(struct address_space *mapping,
return mpage_writepages(mapping, wbc, udf_get_block);
}
-static int udf_readpage(struct file *file, struct page *page)
+static int udf_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, udf_get_block);
+ return mpage_read_folio(folio, udf_get_block);
}
static void udf_readahead(struct readahead_control *rac)
@@ -204,12 +204,12 @@ static void udf_readahead(struct readahead_control *rac)
}
static int udf_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, udf_get_block);
if (unlikely(ret))
udf_write_failed(mapping, pos + len);
return ret;
@@ -237,7 +237,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations udf_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = udf_readpage,
+ .read_folio = udf_read_folio,
.readahead = udf_readahead,
.writepage = udf_writepage,
.writepages = udf_writepages,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 0ed4861b038f..b3d5f97f16cd 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
if (fileident) {
if (adinicb || (offset + lfi < 0)) {
- memcpy(udf_get_fi_ident(sfi), fileident, lfi);
+ memcpy(sfi->impUse + liu, fileident, lfi);
} else if (offset >= 0) {
memcpy(fibh->ebh->b_data + offset, fileident, lfi);
} else {
- memcpy(udf_get_fi_ident(sfi), fileident, -offset);
+ memcpy(sfi->impUse + liu, fileident, -offset);
memcpy(fibh->ebh->b_data, fileident - offset,
lfi + offset);
}
@@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
offset += lfi;
if (adinicb || (offset + padlen < 0)) {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen);
+ memset(sfi->impUse + liu + lfi, 0x00, padlen);
} else if (offset >= 0) {
memset(fibh->ebh->b_data + offset, 0x00, padlen);
} else {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset);
+ memset(sfi->impUse + liu + lfi, 0x00, -offset);
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 9b223421a3c5..f3642f9c23f8 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -101,8 +101,9 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
return 0;
}
-static int udf_symlink_filler(struct file *file, struct page *page)
+static int udf_symlink_filler(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct buffer_head *bh = NULL;
unsigned char *symlink;
@@ -183,7 +184,7 @@ static int udf_symlink_getattr(struct user_namespace *mnt_userns,
* symlinks can't do much...
*/
const struct address_space_operations udf_symlink_aops = {
- .readpage = udf_symlink_filler,
+ .read_folio = udf_symlink_filler,
};
const struct inode_operations udf_symlink_inode_operations = {
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index d0dda01620f0..a873de7dec1c 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -390,7 +390,7 @@ out:
/**
* ufs_getfrag_block() - `get_block_t' function, interface between UFS and
- * readpage, writepage and so on
+ * read_folio, writepage and so on
*/
static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
@@ -472,9 +472,9 @@ static int ufs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page,ufs_getfrag_block,wbc);
}
-static int ufs_readpage(struct file *file, struct page *page)
+static int ufs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,ufs_getfrag_block);
+ return block_read_full_folio(folio, ufs_getfrag_block);
}
int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
@@ -495,13 +495,12 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
}
static int ufs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- ufs_getfrag_block);
+ ret = block_write_begin(mapping, pos, len, pagep, ufs_getfrag_block);
if (unlikely(ret))
ufs_write_failed(mapping, pos + len);
@@ -528,7 +527,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations ufs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = ufs_readpage,
+ .read_folio = ufs_read_folio,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
.write_end = ufs_write_end,
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index aa0c47cb0d16..e943370107d0 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,6 +29,7 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/swapops.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (huge_pte_none(pte))
+ if (huge_pte_none_mostly(pte))
ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
ret = true;
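The *_none_mostly() predicates treat PTE markers the same as none ptes, roughly (sketched from the helper definitions this patch relies on):

	static inline int pte_none_mostly(pte_t pte)
	{
		return pte_none(pte) || is_pte_marker(pte);
	}

	static inline int huge_pte_none_mostly(pte_t pte)
	{
		return huge_pte_none(pte) || is_pte_marker(pte);
	}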
@@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte = pte_offset_map(pmd, address);
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (pte_none(*pte))
+ if (pte_none_mostly(*pte))
ret = true;
if (!pte_write(*pte) && (reason & VM_UFFD_WP))
ret = true;
@@ -1255,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm,
return 0;
}
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- /* FIXME: add WP support to hugetlbfs and shmem */
- if (vm_flags & VM_UFFD_WP) {
- if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
- return false;
- }
-
- if (vm_flags & VM_UFFD_MINOR) {
- if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
- return false;
- }
-
- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
- vma_is_shmem(vma);
-}
-
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1954,6 +1939,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
+#ifndef CONFIG_PTE_MARKER_UFFD_WP
+ uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index d74e0d336995..572aa1c43b37 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -225,8 +225,9 @@ const struct inode_operations vboxsf_reg_iops = {
.setattr = vboxsf_setattr
};
-static int vboxsf_readpage(struct file *file, struct page *page)
+static int vboxsf_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct vboxsf_handle *sf_handle = file->private_data;
loff_t off = page_offset(page);
u32 nread = PAGE_SIZE;
@@ -352,7 +353,7 @@ out:
* page and it does not call SetPageUptodate for partial writes.
*/
const struct address_space_operations vboxsf_reg_aops = {
- .readpage = vboxsf_readpage,
+ .read_folio = vboxsf_read_folio,
.writepage = vboxsf_writepage,
.dirty_folio = filemap_dirty_folio,
.write_begin = simple_write_begin,
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index 24d1b54de807..54598cd80145 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -3,6 +3,7 @@
config FS_VERITY
bool "FS Verity (read-only file-based authenticity protection)"
select CRYPTO
+ select CRYPTO_HASH_INFO
# SHA-256 is implied as it's intended to be the default hash algorithm.
# To avoid bloat, other wanted algorithms must be selected explicitly.
# Note that CRYPTO_SHA256 denotes the generic C implementation, but
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 60a4372aa4d7..df6b499bf6a1 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -18,27 +18,26 @@
* Read a file data page for Merkle tree construction. Do aggressive readahead,
* since we're sequentially reading the entire file.
*/
-static struct page *read_file_data_page(struct file *filp, pgoff_t index,
+static struct page *read_file_data_page(struct file *file, pgoff_t index,
struct file_ra_state *ra,
unsigned long remaining_pages)
{
- struct page *page;
+ DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, index);
+ struct folio *folio;
- page = find_get_page_flags(filp->f_mapping, index, FGP_ACCESSED);
- if (!page || !PageUptodate(page)) {
- if (page)
- put_page(page);
+ folio = __filemap_get_folio(ractl.mapping, index, FGP_ACCESSED, 0);
+ if (!folio || !folio_test_uptodate(folio)) {
+ if (folio)
+ folio_put(folio);
else
- page_cache_sync_readahead(filp->f_mapping, ra, filp,
- index, remaining_pages);
- page = read_mapping_page(filp->f_mapping, index, NULL);
- if (IS_ERR(page))
- return page;
+ page_cache_sync_ra(&ractl, remaining_pages);
+ folio = read_cache_folio(ractl.mapping, index, NULL, file);
+ if (IS_ERR(folio))
+ return &folio->page;
}
- if (PageReadahead(page))
- page_cache_async_readahead(filp->f_mapping, ra, filp, page,
- index, remaining_pages);
- return page;
+ if (folio_test_readahead(folio))
+ page_cache_async_ra(&ractl, folio, remaining_pages);
+ return folio_file_page(folio, index);
}
static int build_merkle_tree_level(struct file *filp, unsigned int level,
@@ -202,7 +201,7 @@ static int enable_verity(struct file *filp,
const struct fsverity_operations *vops = inode->i_sb->s_vop;
struct merkle_tree_params params = { };
struct fsverity_descriptor *desc;
- size_t desc_size = sizeof(*desc) + arg->sig_size;
+ size_t desc_size = struct_size(desc, signature, arg->sig_size);
struct fsverity_info *vi;
int err;
@@ -281,7 +280,7 @@ static int enable_verity(struct file *filp,
* from disk. This is simpler, and it serves as an extra check that the
* metadata we're writing is valid before actually enabling verity.
*/
- vi = fsverity_create_info(inode, desc, desc_size);
+ vi = fsverity_create_info(inode, desc);
if (IS_ERR(vi)) {
err = PTR_ERR(vi);
goto rollback;
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index a7920434bae5..629785c95007 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -14,7 +14,6 @@
#define pr_fmt(fmt) "fs-verity: " fmt
-#include <crypto/sha2.h>
#include <linux/fsverity.h>
#include <linux/mempool.h>
@@ -26,12 +25,6 @@ struct ahash_request;
*/
#define FS_VERITY_MAX_LEVELS 8
-/*
- * Largest digest size among all hash algorithms supported by fs-verity.
- * Currently assumed to be <= size of fsverity_descriptor::root_hash.
- */
-#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
-
/* A hash algorithm supported by fs-verity */
struct fsverity_hash_alg {
struct crypto_ahash *tfm; /* hash tfm, allocated on demand */
@@ -122,16 +115,14 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
const u8 *salt, size_t salt_size);
struct fsverity_info *fsverity_create_info(const struct inode *inode,
- struct fsverity_descriptor *desc,
- size_t desc_size);
+ struct fsverity_descriptor *desc);
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
void fsverity_free_info(struct fsverity_info *vi);
int fsverity_get_descriptor(struct inode *inode,
- struct fsverity_descriptor **desc_ret,
- size_t *desc_size_ret);
+ struct fsverity_descriptor **desc_ret);
int __init fsverity_init_info_cache(void);
void __init fsverity_exit_info_cache(void);
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index f0d7b30c62db..e99c00350c28 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -57,3 +57,46 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
return 0;
}
EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
+
+/**
+ * fsverity_get_digest() - get a verity file's digest
+ * @inode: inode to get digest of
+ * @digest: (out) pointer to the digest
+ * @alg: (out) pointer to the hash algorithm enumeration
+ *
+ * Return the file hash algorithm and digest of an fsverity protected file.
+ * Assumption: before calling fsverity_get_digest(), the file must have been
+ * opened.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_get_digest(struct inode *inode,
+ u8 digest[FS_VERITY_MAX_DIGEST_SIZE],
+ enum hash_algo *alg)
+{
+ const struct fsverity_info *vi;
+ const struct fsverity_hash_alg *hash_alg;
+ int i;
+
+ vi = fsverity_get_info(inode);
+ if (!vi)
+ return -ENODATA; /* not a verity file */
+
+ hash_alg = vi->tree_params.hash_alg;
+ memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE);
+
+ /* convert the verity hash algorithm name to a hash_algo enum value */
+ i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name);
+ if (i < 0)
+ return -EINVAL;
+ *alg = i;
+
+ if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg]))
+ return -EINVAL;
+ memcpy(digest, vi->file_digest, hash_alg->digest_size);
+
+ pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg],
+ hash_digest_size[*alg], digest);
+
+ return 0;
+}
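fsverity_get_digest() is exported so that subsystems such as IMA can obtain the already-computed file digest instead of rehashing the contents. A hedged sketch of a caller (the demo_* name is hypothetical; FS_VERITY_MAX_DIGEST_SIZE, hash_algo_name[] and hash_digest_size[] are the real symbols this patch relies on):

#include <crypto/hash_info.h>
#include <linux/fsverity.h>

static int demo_measure_verity(struct inode *inode)
{
	u8 digest[FS_VERITY_MAX_DIGEST_SIZE];
	enum hash_algo alg;
	int err;

	/* Fails with -ENODATA if the file is not a verity file. */
	err = fsverity_get_digest(inode, digest, &alg);
	if (err)
		return err;

	pr_info("verity digest algo=%s size=%d\n",
		hash_algo_name[alg], hash_digest_size[alg]);
	return 0;
}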
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 92df87f5fa38..81ff94442f7b 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -147,8 +147,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
* fsverity_descriptor must have already undergone basic validation.
*/
struct fsverity_info *fsverity_create_info(const struct inode *inode,
- struct fsverity_descriptor *desc,
- size_t desc_size)
+ struct fsverity_descriptor *desc)
{
struct fsverity_info *vi;
int err;
@@ -264,8 +263,7 @@ static bool validate_fsverity_descriptor(struct inode *inode,
* the filesystem, and do basic validation of it.
*/
int fsverity_get_descriptor(struct inode *inode,
- struct fsverity_descriptor **desc_ret,
- size_t *desc_size_ret)
+ struct fsverity_descriptor **desc_ret)
{
int res;
struct fsverity_descriptor *desc;
@@ -297,7 +295,6 @@ int fsverity_get_descriptor(struct inode *inode,
}
*desc_ret = desc;
- *desc_size_ret = res;
return 0;
}
@@ -306,17 +303,16 @@ static int ensure_verity_info(struct inode *inode)
{
struct fsverity_info *vi = fsverity_get_info(inode);
struct fsverity_descriptor *desc;
- size_t desc_size;
int err;
if (vi)
return 0;
- err = fsverity_get_descriptor(inode, &desc, &desc_size);
+ err = fsverity_get_descriptor(inode, &desc);
if (err)
return err;
- vi = fsverity_create_info(inode, desc, desc_size);
+ vi = fsverity_create_info(inode, desc);
if (IS_ERR(vi)) {
err = PTR_ERR(vi);
goto out_free_desc;
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 7e2d0c7bdf0d..6ee849dc7bc1 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -101,7 +101,7 @@ static int fsverity_read_descriptor(struct inode *inode,
size_t desc_size;
int res;
- res = fsverity_get_descriptor(inode, &desc, &desc_size);
+ res = fsverity_get_descriptor(inode, &desc);
if (res)
return res;
@@ -119,10 +119,9 @@ static int fsverity_read_signature(struct inode *inode,
void __user *buf, u64 offset, int length)
{
struct fsverity_descriptor *desc;
- size_t desc_size;
int res;
- res = fsverity_get_descriptor(inode, &desc, &desc_size);
+ res = fsverity_get_descriptor(inode, &desc);
if (res)
return res;
diff --git a/fs/xattr.c b/fs/xattr.c
index 998045165916..e8dd03e4561e 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -25,6 +25,8 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
@@ -539,44 +541,76 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
/*
* Extended attribute SET operations
*/
-static long
-setxattr(struct user_namespace *mnt_userns, struct dentry *d,
- const char __user *name, const void __user *value, size_t size,
- int flags)
+
+int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
{
int error;
- void *kvalue = NULL;
- char kname[XATTR_NAME_MAX + 1];
- if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
+ if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
return -EINVAL;
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
+ error = strncpy_from_user(ctx->kname->name, name,
+ sizeof(ctx->kname->name));
+ if (error == 0 || error == sizeof(ctx->kname->name))
+ return -ERANGE;
if (error < 0)
return error;
- if (size) {
- if (size > XATTR_SIZE_MAX)
+ error = 0;
+ if (ctx->size) {
+ if (ctx->size > XATTR_SIZE_MAX)
return -E2BIG;
- kvalue = kvmalloc(size, GFP_KERNEL);
- if (!kvalue)
- return -ENOMEM;
- if (copy_from_user(kvalue, value, size)) {
- error = -EFAULT;
- goto out;
+
+ ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size);
+ if (IS_ERR(ctx->kvalue)) {
+ error = PTR_ERR(ctx->kvalue);
+ ctx->kvalue = NULL;
}
- if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d),
- kvalue, size);
}
- error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags);
-out:
- kvfree(kvalue);
+ return error;
+}
+
+static void setxattr_convert(struct user_namespace *mnt_userns,
+ struct dentry *d, struct xattr_ctx *ctx)
+{
+ if (ctx->size &&
+ ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
+ (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)))
+ posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d),
+ ctx->kvalue, ctx->size);
+}
+
+int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct xattr_ctx *ctx)
+{
+ setxattr_convert(mnt_userns, dentry, ctx);
+ return vfs_setxattr(mnt_userns, dentry, ctx->kname->name,
+ ctx->kvalue, ctx->size, ctx->flags);
+}
+
+static long
+setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ const char __user *name, const void __user *value, size_t size,
+ int flags)
+{
+ struct xattr_name kname;
+ struct xattr_ctx ctx = {
+ .cvalue = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = flags,
+ };
+ int error;
+ error = setxattr_copy(name, &ctx);
+ if (error)
+ return error;
+
+ error = do_setxattr(mnt_userns, d, &ctx);
+
+ kvfree(ctx.kvalue);
return error;
}
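The point of splitting setxattr() into setxattr_copy() and do_setxattr() is that the copy from user memory and the actual VFS operation can now run in different contexts, which is what asynchronous callers such as io_uring need. A sketch of how a caller might sequence the two halves (the demo_* name is hypothetical; struct xattr_ctx and struct xattr_name come from fs/internal.h as extended by this patch):

static long demo_split_setxattr(struct user_namespace *mnt_userns,
				struct dentry *dentry,
				const char __user *name,
				const void __user *value,
				size_t size, int flags)
{
	struct xattr_name kname;
	struct xattr_ctx ctx = {
		.cvalue	= value,
		.kvalue	= NULL,
		.size	= size,
		.kname	= &kname,
		.flags	= flags,
	};
	long error;

	/* Phase 1: must run while user memory is reachable. */
	error = setxattr_copy(name, &ctx);
	if (error)
		return error;

	/* Phase 2: can run later, e.g. from a worker thread. */
	error = do_setxattr(mnt_userns, dentry, &ctx);

	kvfree(ctx.kvalue);
	return error;
}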
@@ -642,44 +676,61 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
/*
* Extended attribute GET operations
*/
-static ssize_t
-getxattr(struct user_namespace *mnt_userns, struct dentry *d,
- const char __user *name, void __user *value, size_t size)
+ssize_t
+do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ struct xattr_ctx *ctx)
{
ssize_t error;
- void *kvalue = NULL;
- char kname[XATTR_NAME_MAX + 1];
-
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
+ char *kname = ctx->kname->name;
- if (size) {
- if (size > XATTR_SIZE_MAX)
- size = XATTR_SIZE_MAX;
- kvalue = kvzalloc(size, GFP_KERNEL);
- if (!kvalue)
+ if (ctx->size) {
+ if (ctx->size > XATTR_SIZE_MAX)
+ ctx->size = XATTR_SIZE_MAX;
+ ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL);
+ if (!ctx->kvalue)
return -ENOMEM;
}
- error = vfs_getxattr(mnt_userns, d, kname, kvalue, size);
+ error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size);
if (error > 0) {
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d),
- kvalue, error);
- if (size && copy_to_user(value, kvalue, error))
+ ctx->kvalue, error);
+ if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
error = -EFAULT;
- } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
+ } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
/* The file system tried to return a value bigger
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
- kvfree(kvalue);
+ return error;
+}
+
+static ssize_t
+getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ const char __user *name, void __user *value, size_t size)
+{
+ ssize_t error;
+ struct xattr_name kname;
+ struct xattr_ctx ctx = {
+ .value = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = 0,
+ };
+
+ error = strncpy_from_user(kname.name, name, sizeof(kname.name));
+ if (error == 0 || error == sizeof(kname.name))
+ error = -ERANGE;
+ if (error < 0)
+ return error;
+
+ error = do_getxattr(mnt_userns, d, &ctx);
+ kvfree(ctx.kvalue);
return error;
}
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 04611a1068b4..b056cfc6398e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -102,6 +102,7 @@ xfs-y += xfs_log.o \
xfs_buf_item_recover.o \
xfs_dquot_item_recover.o \
xfs_extfree_item.o \
+ xfs_attr_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
xfs_inode_item_recover.o \
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b52ed339727f..d3f2886fdc08 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(
ASSERT(bno != NULLFSBLOCK);
ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
agno = XFS_FSB_TO_AGNO(mp, bno);
agbno = XFS_FSB_TO_AGBNO(mp, bno);
@@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist(
xfs_agblock_t bno;
__be32 *agfl_bno;
int error;
- int logflags;
+ uint32_t logflags;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_perag *pag;
@@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist(
*/
void
xfs_alloc_log_agf(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields) /* mask of fields to be logged (XFS_AGF_...) */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte offset */
int last; /* last byte offset */
@@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist(
struct xfs_perag *pag;
__be32 *blockp;
int error;
- int logflags;
+ uint32_t logflags;
__be32 *agfl_bno;
int startoff;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d4c057b764f9..84ca09b2223f 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -121,7 +121,7 @@ void
xfs_alloc_log_agf(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields);/* mask of fields to be logged (XFS_AGF_...) */
+ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
/*
* Interface for inode allocation to force the pag data to be initialized.
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 23523b802539..14ae0826bc15 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -24,6 +24,11 @@
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_log.h"
+
+struct kmem_cache *xfs_attri_cache;
+struct kmem_cache *xfs_attrd_cache;
/*
* xfs_attr.c
@@ -53,26 +58,22 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
*/
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
-STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac);
+static int xfs_attr_node_try_addname(struct xfs_attr_item *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr);
STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
struct xfs_da_state **state);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp);
-STATIC int xfs_attr_node_removename(struct xfs_da_args *args,
- struct xfs_da_state *state);
int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
- if (!XFS_IFORK_Q(ip) ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0))
+ if (!XFS_IFORK_Q(ip))
+ return 0;
+ if (!ip->i_afp)
+ return 0;
+ if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0)
return 0;
return 1;
}
@@ -97,6 +98,123 @@ xfs_attr_is_leaf(
return imap.br_startoff == 0 && imap.br_blockcount == 1;
}
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ trace_xfs_attr_fillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ trace_xfs_attr_refillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return 0;
+}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
+
/*========================================================================
* Overall external interface routines.
*========================================================================*/
@@ -166,7 +284,7 @@ xfs_attr_get(
/*
* Calculate how many blocks we need for the new attribute,
*/
-STATIC int
+int
xfs_attr_calc_size(
struct xfs_da_args *args,
int *local)
@@ -199,6 +317,33 @@ xfs_attr_calc_size(
return nblks;
}
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+ struct xfs_da_args *args,
+ struct xfs_trans_res *tres,
+ unsigned int *total)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+
+ if (args->value) {
+ tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ *total = args->total;
+ } else {
+ *tres = M_RES(mp)->tr_attrrm;
+ *total = XFS_ATTRRM_SPACE_RES(mp);
+ }
+}
+
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC.
+ */
STATIC int
xfs_attr_try_sf_addname(
struct xfs_inode *dp,
@@ -230,411 +375,470 @@ xfs_attr_try_sf_addname(
return error;
}
-/*
- * Check to see if the attr should be upgraded from non-existent or shortform to
- * single-leaf-block attribute list.
- */
-static inline bool
-xfs_attr_is_shortform(
- struct xfs_inode *ip)
+static int
+xfs_attr_sf_addname(
+ struct xfs_attr_item *attr)
{
- return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0);
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
+
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ ASSERT(!error || error == -EEXIST);
+ attr->xattri_dela_state = XFS_DAS_DONE;
+ goto out;
+ }
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+ * push cannot grab the half-baked leaf buffer and run into problems
+ * with the write verifier.
+ */
+ xfs_trans_bhold(args->trans, attr->xattri_leaf_bp);
+ attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+ trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
}
/*
- * Checks to see if a delayed attribute transaction should be rolled. If so,
- * transaction is finished or rolled as needed.
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in an attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
*/
-STATIC int
-xfs_attr_trans_roll(
- struct xfs_delattr_context *dac)
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+ struct xfs_attr_item *attr,
+ enum xfs_delattr_state replace_state)
{
- struct xfs_da_args *args = dac->da_args;
- int error;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
+}
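To make the two-phase replace concrete, here is the state sequence implied by xfs_attr_complete_op() for replacing a leaf-format attr whose old and new values are both inline (a reading of the state machine below, not text from the patch):

/*
 * Inline-value replace, leaf format (sketch):
 *
 *   XFS_DAS_LEAF_ADD          add the new attr, XFS_DA_OP_REPLACE set;
 *                             xfs_attr_complete_op() consumes the flag
 *                             and returns XFS_DAS_LEAF_REPLACE
 *   XFS_DAS_LEAF_REPLACE      flip INCOMPLETE flags on old/new attrs
 *   XFS_DAS_LEAF_REMOVE_OLD   no remote blocks, so skip straight to
 *   XFS_DAS_LEAF_REMOVE_ATTR  remove the old attr; complete_op() now
 *                             sees no REPLACE flag and returns
 *   XFS_DAS_DONE
 */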
+
+static int
+xfs_attr_leaf_addname(
+ struct xfs_attr_item *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ ASSERT(xfs_attr_is_leaf(args->dp));
+
+ /*
+ * Use the leaf buffer we may already hold locked as a result of
+ * a sf-to-leaf conversion. The held buffer is no longer valid
+ * after this call, regardless of the result.
+ */
+ error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp);
+ attr->xattri_leaf_bp = NULL;
+
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
- if (dac->flags & XFS_DAC_DEFER_FINISH) {
/*
- * The caller wants us to finish all the deferred ops so that we
- * avoid pinning the log tail with a large number of deferred
- * ops.
+ * We're not in leaf format anymore, so roll the transaction and
+ * retry the add to the newly allocated node block.
*/
- dac->flags &= ~XFS_DAC_DEFER_FINISH;
- error = xfs_defer_finish(&args->trans);
- } else
- error = xfs_trans_roll_inode(&args->trans, args->dp);
+ attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+ goto out;
+ }
+ if (error)
+ return error;
+ /*
+ * We need to commit and roll if we need to allocate remote xattr blocks
+ * or perform more xattr manipulations. Otherwise there is nothing more
+ * to do and we can return success.
+ */
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
+out:
+ trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
/*
- * Set the attribute specified in @args.
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
*/
-int
-xfs_attr_set_args(
- struct xfs_da_args *args)
+static int
+xfs_attr_node_addname(
+ struct xfs_attr_item *attr)
{
- struct xfs_buf *leaf_bp = NULL;
- int error = 0;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
- do {
- error = xfs_attr_set_iter(&dac, &leaf_bp);
- if (error != -EAGAIN)
- break;
+ ASSERT(!attr->xattri_leaf_bp);
+
+ error = xfs_attr_node_addname_find_attr(attr);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error) {
- if (leaf_bp)
- xfs_trans_brelse(args->trans, leaf_bp);
+ error = xfs_attr_node_try_addname(attr);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
return error;
- }
- } while (true);
+ /*
+ * No state change, we really are in node form now
+ * but we need the transaction rolled to continue.
+ */
+ goto out;
+ }
+ if (error)
+ return error;
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
+out:
+ trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
-STATIC int
-xfs_attr_sf_addname(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static int
+xfs_attr_rmtval_alloc(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error = 0;
/*
- * Try to add the attr to the attribute list in the inode.
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
*/
- error = xfs_attr_try_sf_addname(dp, args);
+ if (attr->xattri_blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(attr);
+ if (error)
+ return error;
+ /* Roll the transaction only if there is more to allocate. */
+ if (attr->xattri_blkcnt > 0)
+ goto out;
+ }
- /* Should only be 0, -EEXIST or -ENOSPC */
- if (error != -ENOSPC)
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
return error;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
/*
- * It won't fit in the shortform, transform to a leaf block. GROT:
- * another possible req'mt for a double-split btree op.
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
*/
- error = xfs_attr_shortform_to_leaf(args, leaf_bp);
- if (error)
- return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ error = xfs_attr3_leaf_clearflag(args);
+out:
+ trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
/*
- * Prevent the leaf buffer from being unlocked so that a concurrent AIL
- * push cannot grab the half-baked leaf buffer and run into problems
- * with the write verifier.
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
*/
- xfs_trans_bhold(args->trans, *leaf_bp);
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
/*
- * We're still in XFS_DAS_UNINIT state here. We've converted
- * the attr fork to leaf format and will restart with the leaf
- * add.
+ * Mark the attribute as INCOMPLETE
*/
- trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp);
- dac->flags |= XFS_DAC_DEFER_FINISH;
- return -EAGAIN;
+ return xfs_attr3_leaf_setflag(args);
}
/*
- * Set the attribute specified in @args.
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- * returned.
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
*/
-int
-xfs_attr_set_iter(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static
+int xfs_attr_node_removename_setup(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
- struct xfs_buf *bp = NULL;
- int forkoff, error = 0;
-
- /* State machine switch */
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- /*
- * If the fork is shortform, attempt to add the attr. If there
- * is no space, this converts to leaf format and returns
- * -EAGAIN with the leaf buffer held across the roll. The caller
- * will deal with a transaction roll error, but otherwise
- * release the hold once we return with a clean transaction.
- */
- if (xfs_attr_is_shortform(dp))
- return xfs_attr_sf_addname(dac, leaf_bp);
- if (*leaf_bp != NULL) {
- xfs_trans_bhold_release(args->trans, *leaf_bp);
- *leaf_bp = NULL;
- }
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state **state = &attr->xattri_da_state;
+ int error;
- if (xfs_attr_is_leaf(dp)) {
- error = xfs_attr_leaf_try_add(args, *leaf_bp);
- if (error == -ENOSPC) {
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- return error;
-
- /*
- * Finish any deferred work items and roll the
- * transaction once more. The goal here is to
- * call node_addname with the inode and
- * transaction in the same state (inode locked
- * and joined, transaction clean) no matter how
- * we got to this step.
- *
- * At this point, we are still in
- * XFS_DAS_UNINIT, but when we come back, we'll
- * be a node, so we'll fall down into the node
- * handling code below
- */
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- } else if (error) {
- return error;
- }
+ error = xfs_attr_node_hasname(args, state);
+ if (error != -EEXIST)
+ goto out;
+ error = 0;
- dac->dela_state = XFS_DAS_FOUND_LBLK;
- } else {
- error = xfs_attr_node_addname_find_attr(dac);
- if (error)
- return error;
+ ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
+ ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr_node_addname(dac);
- if (error)
- return error;
+ error = xfs_attr_leaf_mark_incomplete(args, *state);
+ if (error)
+ goto out;
+ if (args->rmtblkno > 0)
+ error = xfs_attr_rmtval_invalidate(args);
+out:
+ if (error)
+ xfs_da_state_free(*state);
- dac->dela_state = XFS_DAS_FOUND_NBLK;
- }
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FOUND_LBLK:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
+ return error;
+}
- /* Open coded xfs_attr_rmtval_set without trans handling */
- if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) {
- dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT;
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
- }
- }
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+ struct xfs_attr_item *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff;
+ int error;
- /*
- * Repeat allocating remote blocks for the attr value until
- * blkcnt drops to zero.
- */
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(dac->dela_state,
- args->dp);
- return -EAGAIN;
- }
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
+ xfs_attr3_leaf_remove(bp, args);
- /*
- * If this is not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- return error;
- }
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
- /*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- return error;
- /*
- * Commit the flag value change and start the next trans in
- * series.
- */
- dac->dela_state = XFS_DAS_FLIP_LFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FLIP_LFLAG:
- /*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
- */
- xfs_attr_restore_rmt_blk(args);
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
+ return error;
+}
- fallthrough;
- case XFS_DAS_RM_LBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_LBLK;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- if (error)
- return error;
+/*
+ * Shrink an attribute from leaf to shortform. Used by the node format remove
+ * path when the node format collapses to a single block and so we have to check
+ * if it can be collapsed further.
+ */
+static int
+xfs_attr_leaf_shrink(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
- dac->dela_state = XFS_DAS_RD_LEAF;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
+ if (!xfs_attr_is_leaf(dp))
+ return 0;
- fallthrough;
- case XFS_DAS_RD_LEAF:
- /*
- * This is the last step for leaf format. Read the block with
- * the old attr, remove the old attr, check for shortform
- * conversion and return.
- */
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
- if (error)
- return error;
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
- xfs_attr3_leaf_remove(bp, args);
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else {
+ xfs_trans_brelse(args->trans, bp);
+ }
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff)
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
+ return error;
+}
- return error;
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete. Calling functions will
+ * need to handle this, and recall the function until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+ struct xfs_attr_item *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
- case XFS_DAS_FOUND_NBLK:
- /*
- * Find space for remote blocks and fall into the allocation
- * state.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
+ /* State machine switch */
+next_state:
+ switch (attr->xattri_dela_state) {
+ case XFS_DAS_UNINIT:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ case XFS_DAS_SF_ADD:
+ return xfs_attr_sf_addname(attr);
+ case XFS_DAS_LEAF_ADD:
+ return xfs_attr_leaf_addname(attr);
+ case XFS_DAS_NODE_ADD:
+ return xfs_attr_node_addname(attr);
+
+ case XFS_DAS_SF_REMOVE:
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_LEAF_REMOVE:
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_NODE_REMOVE:
+ error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
}
+ if (error)
+ return error;
+ attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+ if (args->rmtblkno == 0)
+ attr->xattri_dela_state++;
+ break;
+ case XFS_DAS_LEAF_SET_RMT:
+ case XFS_DAS_NODE_SET_RMT:
+ error = xfs_attr_rmtval_find_space(attr);
+ if (error)
+ return error;
+ attr->xattri_dela_state++;
fallthrough;
- case XFS_DAS_ALLOC_NODE:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- dac->dela_state = XFS_DAS_ALLOC_NODE;
- if (args->rmtblkno > 0) {
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
- }
- /*
- * If this was not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- goto out;
- }
+ case XFS_DAS_LEAF_ALLOC_RMT:
+ case XFS_DAS_NODE_ALLOC_RMT:
+ error = xfs_attr_rmtval_alloc(attr);
+ if (error)
+ return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ break;
+ goto next_state;
+ case XFS_DAS_LEAF_REPLACE:
+ case XFS_DAS_NODE_REPLACE:
/*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
+ * We must "flip" the incomplete flags on the "new" and "old"
+ * attribute/value pairs so that one disappears and one appears
+ * atomically.
*/
error = xfs_attr3_leaf_flipflags(args);
if (error)
- goto out;
+ return error;
/*
- * Commit the flag value change and start the next trans in
- * series
+ * We must commit the flag value change now to make it atomic
+ * and then we can start the next trans in series at REMOVE_OLD.
*/
- dac->dela_state = XFS_DAS_FLIP_NFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ attr->xattri_dela_state++;
+ break;
- case XFS_DAS_FLIP_NFLAG:
+ case XFS_DAS_LEAF_REMOVE_OLD:
+ case XFS_DAS_NODE_REMOVE_OLD:
/*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
+ * If we have a remote attr, start the process of removing it
+ * by invalidating any cached buffers.
+ *
+ * If we don't have a remote attr, we skip the remote block
+ * removal state altogether with a second state increment.
*/
xfs_attr_restore_rmt_blk(args);
-
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
-
- fallthrough;
- case XFS_DAS_RM_NBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_NBLK;
if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
-
+ error = xfs_attr_rmtval_invalidate(args);
if (error)
return error;
+ } else {
+ attr->xattri_dela_state++;
+ }
+
+ attr->xattri_dela_state++;
+ goto next_state;
- dac->dela_state = XFS_DAS_CLR_FLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ case XFS_DAS_LEAF_REMOVE_RMT:
+ case XFS_DAS_NODE_REMOVE_RMT:
+ error = xfs_attr_rmtval_remove(attr);
+ if (error == -EAGAIN) {
+ error = 0;
+ break;
}
+ if (error)
+ return error;
- fallthrough;
- case XFS_DAS_CLR_FLAG:
/*
- * The last state for node format. Look up the old attr and
- * remove it.
+ * We've finished removing the remote attr blocks, so commit the
+ * transaction and move on to removing the attr name from the
+ * leaf/node block. Removing the attr might require a full
+ * transaction reservation for btree block freeing, so we
+ * can't do that in the same transaction where we removed the
+ * remote attr blocks.
*/
- error = xfs_attr_node_addname_clear_incomplete(dac);
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_ATTR:
+ error = xfs_attr_leaf_remove_attr(attr);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+
+ case XFS_DAS_NODE_REMOVE_ATTR:
+ error = xfs_attr_node_remove_attr(attr);
+ if (!error)
+ error = xfs_attr_leaf_shrink(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
break;
default:
ASSERT(0);
break;
}
-out:
+
+ trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
return error;
}
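xfs_attr_set_iter() now performs exactly one transactional unit of work per call, so some driver must roll the transaction between steps and re-enter until XFS_DAS_DONE. In this series that driver is the deferred-ops ->finish_item callback for XFS_DEFER_OPS_TYPE_ATTR; a simplified sketch of the loop it implements (illustrative only, ignoring log intent items):

static int demo_drive_attr_op(struct xfs_attr_item *attr)
{
	struct xfs_da_args *args = attr->xattri_da_args;
	int error;

	for (;;) {
		error = xfs_attr_set_iter(attr);
		if (error)
			return error;
		if (attr->xattri_dela_state == XFS_DAS_DONE)
			return 0;

		/* Commit this step and continue with a clean transaction. */
		error = xfs_trans_roll_inode(&args->trans, args->dp);
		if (error)
			return error;
	}
}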
@@ -668,30 +872,79 @@ xfs_attr_lookup(
return xfs_attr_node_hasname(args, NULL);
}
-/*
- * Remove the attribute specified in @args.
- */
-int
-xfs_attr_remove_args(
+static int
+xfs_attr_item_init(
+ struct xfs_da_args *args,
+ unsigned int op_flags, /* op flag (set or remove) */
+ struct xfs_attr_item **attr) /* new xfs_attr_item */
+{
+
+ struct xfs_attr_item *new;
+
+ new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS);
+ new->xattri_op_flags = op_flags;
+ new->xattri_da_args = args;
+
+ *attr = new;
+ return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
struct xfs_da_args *args)
{
- int error;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_attr_item *new;
+ int error = 0;
- do {
- error = xfs_attr_remove_iter(&dac);
- if (error != -EAGAIN)
- break;
+ error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error)
- return error;
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
- } while (true);
+ return 0;
+}
- return error;
+/* Replaces an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_item *new;
+ int error = 0;
+
+ error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+ struct xfs_da_args *args)
+{
+
+ struct xfs_attr_item *new;
+ int error;
+
+ error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
+
+ return 0;
}
/*
@@ -709,6 +962,7 @@ xfs_attr_set(
int error, local;
int rmt_blks = 0;
unsigned int total;
+ int delayed = xfs_has_larp(mp);
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
@@ -730,8 +984,6 @@ xfs_attr_set(
if (args->value) {
XFS_STATS_INC(mp, xs_attr_set);
-
- args->op_flags |= XFS_DA_OP_ADDNAME;
args->total = xfs_attr_calc_size(args, &local);
/*
@@ -748,61 +1000,68 @@ xfs_attr_set(
return error;
}
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres *
- args->total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- total = args->total;
-
if (!local)
rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
} else {
XFS_STATS_INC(mp, xs_attr_remove);
-
- tres = M_RES(mp)->tr_attrrm;
- total = XFS_ATTRRM_SPACE_RES(mp);
rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
}
+ if (delayed) {
+ error = xfs_attr_use_log_assist(mp);
+ if (error)
+ return error;
+ }
+
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
+ xfs_init_attr_trans(args, &tres, &total);
error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
- return error;
+ goto drop_incompat;
if (args->value || xfs_inode_hasattr(dp)) {
error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(args->trans, dp,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
error = xfs_attr_lookup(args);
- if (args->value) {
- if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
- goto out_trans_cancel;
- if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_trans_cancel;
- if (error != -ENOATTR && error != -EEXIST)
+ switch (error) {
+ case -EEXIST:
+ /* if no value, we are performing a remove operation */
+ if (!args->value) {
+ error = xfs_attr_defer_remove(args);
+ break;
+ }
+ /* Pure create fails if the attr already exists */
+ if (args->attr_flags & XATTR_CREATE)
goto out_trans_cancel;
- error = xfs_attr_set_args(args);
- if (error)
- goto out_trans_cancel;
- /* shortform attribute has already been committed */
- if (!args->trans)
- goto out_unlock;
- } else {
- if (error != -EEXIST)
+ error = xfs_attr_defer_replace(args);
+ break;
+ case -ENOATTR:
+ /* Can't remove what isn't there. */
+ if (!args->value)
goto out_trans_cancel;
- error = xfs_attr_remove_args(args);
- if (error)
+ /* Pure replace fails if no existing attr to replace. */
+ if (args->attr_flags & XATTR_REPLACE)
goto out_trans_cancel;
+
+ error = xfs_attr_defer_add(args);
+ break;
+ default:
+ goto out_trans_cancel;
}
+ if (error)
+ goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
@@ -821,6 +1080,9 @@ xfs_attr_set(
error = xfs_trans_commit(args->trans);
out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
+drop_incompat:
+ if (delayed)
+ xlog_drop_incompat_feat(mp->m_log);
return error;
out_trans_cancel:
@@ -829,6 +1091,40 @@ out_trans_cancel:
goto out_unlock;
}
+int __init
+xfs_attri_init_cache(void)
+{
+ xfs_attri_cache = kmem_cache_create("xfs_attri",
+ sizeof(struct xfs_attri_log_item),
+ 0, 0, NULL);
+
+ return xfs_attri_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attri_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attri_cache);
+ xfs_attri_cache = NULL;
+}
+
+int __init
+xfs_attrd_init_cache(void)
+{
+ xfs_attrd_cache = kmem_cache_create("xfs_attrd",
+ sizeof(struct xfs_attrd_log_item),
+ 0, 0, NULL);
+
+ return xfs_attrd_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attrd_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attrd_cache);
+ xfs_attrd_cache = NULL;
+}
+
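The attri/attrd caches are created once at module init and destroyed at exit. A sketch of the pairing with error unwinding (the real hookup lives in the XFS module init path, outside this hunk; demo_* names are illustrative):

static int __init demo_attr_caches_init(void)
{
	int error;

	error = xfs_attri_init_cache();
	if (error)
		return error;

	error = xfs_attrd_init_cache();
	if (error) {
		/* Unwind in reverse order on failure. */
		xfs_attri_destroy_cache();
		return error;
	}
	return 0;
}

static void demo_attr_caches_exit(void)
{
	xfs_attrd_destroy_cache();
	xfs_attri_destroy_cache();
}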
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
@@ -845,28 +1141,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
* Add a name to the shortform attribute list structure
* This is the external routine.
*/
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
+static int
+xfs_attr_shortform_addname(
+ struct xfs_da_args *args)
{
- int newsize, forkoff, retval;
+ int newsize, forkoff;
+ int error;
trace_xfs_attr_sf_addname(args);
- retval = xfs_attr_shortform_lookup(args);
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- return retval;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
- return retval;
- retval = xfs_attr_sf_removename(args);
- if (retval)
- return retval;
+ error = xfs_attr_shortform_lookup(args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ return error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ return error;
+
+ error = xfs_attr_sf_removename(args);
+ if (error)
+ return error;
+
/*
- * Since we have removed the old attr, clear ATTR_REPLACE so
- * that the leaf format add routine won't trip over the attr
- * not being around.
+ * Since we have removed the old attr, clear XFS_DA_OP_REPLACE
+ * so that if the new attr doesn't fit in shortform format, the
+ * leaf format add routine won't trip over the attr not being
+ * around.
*/
- args->attr_flags &= ~XATTR_REPLACE;
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ break;
+ case 0:
+ break;
+ default:
+ return error;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
@@ -889,8 +1198,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
* External routines when attribute list is one block
*========================================================================*/
-/* Store info about a remote block */
-STATIC void
+/* Save the current remote block info and clear the current pointers. */
+static void
xfs_attr_save_rmt_blk(
struct xfs_da_args *args)
{
@@ -899,10 +1208,13 @@ xfs_attr_save_rmt_blk(
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
args->rmtvaluelen2 = args->rmtvaluelen;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/* Set stored info about a remote block */
-STATIC void
+static void
xfs_attr_restore_rmt_blk(
struct xfs_da_args *args)
{
@@ -928,45 +1240,54 @@ xfs_attr_leaf_try_add(
struct xfs_da_args *args,
struct xfs_buf *bp)
{
- int retval;
+ int error;
/*
- * Look up the given attribute in the leaf block. Figure out if
- * the given flags produce an error or call for an atomic rename.
+ * If the caller provided a buffer to us, it is locked and held in
+ * the transaction because it just did a shortform to leaf conversion.
+ * Hence we don't need to read it again. Otherwise read in the leaf
+ * buffer.
*/
- retval = xfs_attr_leaf_hasname(args, &bp);
- if (retval != -ENOATTR && retval != -EEXIST)
- return retval;
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_brelse;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ if (bp) {
+ xfs_trans_bhold_release(args->trans, bp);
+ } else {
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Look up the xattr name to set the insertion point for the new xattr.
+ */
+ error = xfs_attr3_leaf_lookup_int(bp, args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto out_brelse;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto out_brelse;
trace_xfs_attr_leaf_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
- xfs_attr_save_rmt_blk(args);
-
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto out_brelse;
}
- /*
- * Add the attribute to the leaf block
- */
return xfs_attr3_leaf_add(bp, args);
out_brelse:
xfs_trans_brelse(args->trans, bp);
- return retval;
+ return error;
}
/*
@@ -1012,9 +1333,10 @@ xfs_attr_leaf_removename(
dp = args->dp;
error = xfs_attr_leaf_hasname(args, &bp);
-
if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
return error;
} else if (error != -EEXIST)
return error;
@@ -1098,46 +1420,45 @@ xfs_attr_node_hasname(
STATIC int
xfs_attr_node_addname_find_attr(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- int retval;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- retval = xfs_attr_node_hasname(args, &dac->da_state);
- if (retval != -ENOATTR && retval != -EEXIST)
- goto error;
-
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto error;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ error = xfs_attr_node_hasname(args, &attr->xattri_da_state);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto error;
- trace_xfs_attr_node_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
- xfs_attr_save_rmt_blk(args);
+ trace_xfs_attr_node_replace(args);
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto error;
}
return 0;
error:
- if (dac->da_state)
- xfs_da_state_free(dac->da_state);
- return retval;
+ if (attr->xattri_da_state)
+ xfs_da_state_free(attr->xattri_da_state);
+ return error;
}
/*
@@ -1146,21 +1467,13 @@ error:
* This will involve walking down the Btree, and may involve splitting
* leaf nodes and even splitting intermediate nodes up to and including
* the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- *
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- *returned.
*/
-STATIC int
-xfs_attr_node_addname(
- struct xfs_delattr_context *dac)
+static int
+xfs_attr_node_try_addname(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state = attr->xattri_da_state;
struct xfs_da_state_blk *blk;
int error;
@@ -1175,25 +1488,9 @@ xfs_attr_node_addname(
/*
* Its really a single leaf node, but it had
* out-of-line values so it looked like it *might*
- * have been a b-tree.
+ * have been a b-tree. Let the caller deal with this.
*/
- xfs_da_state_free(state);
- state = NULL;
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- goto out;
-
- /*
- * Now that we have converted the leaf to a node, we can
- * roll the transaction, and try xfs_attr3_leaf_add
- * again on re-entry. No need to set dela_state to do
- * this. dela_state is still unset by this function at
- * this point.
- */
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_node_addname_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
+ goto out;
}
/*
@@ -1205,7 +1502,6 @@ xfs_attr_node_addname(
error = xfs_da3_split(state);
if (error)
goto out;
- dac->flags |= XFS_DAC_DEFER_FINISH;
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1214,24 +1510,42 @@ xfs_attr_node_addname(
}
out:
- if (state)
- xfs_da_state_free(state);
+ xfs_da_state_free(state);
return error;
}
+static int
+xfs_attr_node_removename(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *blk;
+ int retval;
-STATIC int
-xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac)
+ /*
+ * Remove the name and update the hashvals in the tree.
+ */
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
+
+ return retval;
+}
+
+static int
+xfs_attr_node_remove_attr(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
struct xfs_da_state *state = NULL;
int retval = 0;
int error = 0;
/*
- * Re-find the "old" attribute entry after any split ops. The INCOMPLETE
- * flag means that we will find the "old" attr, not the "new" one.
+ * The attr we are removing has already been marked incomplete, so
+ * we need to set the filter appropriately to re-find the "old"
+ * attribute entry after any split ops.
*/
args->attr_filter |= XFS_ATTR_INCOMPLETE;
state = xfs_da_state_alloc(args);
@@ -1261,362 +1575,6 @@ out:
}
/*
- * Shrink an attribute from leaf to shortform
- */
-STATIC int
-xfs_attr_node_shrink(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- struct xfs_inode *dp = args->dp;
- int error, forkoff;
- struct xfs_buf *bp;
-
- /*
- * Have to get rid of the copy of this dabuf in the state.
- */
- ASSERT(state->path.active == 1);
- ASSERT(state->path.blk[0].bp);
- state->path.blk[0].bp = NULL;
-
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
- if (error)
- return error;
-
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- } else
- xfs_trans_brelse(args->trans, bp);
-
- return error;
-}
-
-/*
- * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
- * for later deletion of the entry.
- */
-STATIC int
-xfs_attr_leaf_mark_incomplete(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- int error;
-
- /*
- * Fill in disk block numbers in the state structure
- * so that we can get the buffers back after we commit
- * several transactions in the following calls.
- */
- error = xfs_attr_fillstate(state);
- if (error)
- return error;
-
- /*
- * Mark the attribute as INCOMPLETE
- */
- return xfs_attr3_leaf_setflag(args);
-}
-
-/*
- * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
- * the blocks are valid. Attr keys with remote blocks will be marked
- * incomplete.
- */
-STATIC
-int xfs_attr_node_removename_setup(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state **state = &dac->da_state;
- int error;
-
- error = xfs_attr_node_hasname(args, state);
- if (error != -EEXIST)
- goto out;
- error = 0;
-
- ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
- ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
- XFS_ATTR_LEAF_MAGIC);
-
- if (args->rmtblkno > 0) {
- error = xfs_attr_leaf_mark_incomplete(args, *state);
- if (error)
- goto out;
-
- error = xfs_attr_rmtval_invalidate(args);
- }
-out:
- if (error)
- xfs_da_state_free(*state);
-
- return error;
-}
-
-STATIC int
-xfs_attr_node_removename(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- struct xfs_da_state_blk *blk;
- int retval;
-
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[state->path.active-1];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr3_leaf_remove(blk->bp, args);
- xfs_da3_fixhashpath(state, &state->path);
-
- return retval;
-}
-
-/*
- * Remove the attribute specified in @args.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * This routine is meant to function as either an in-line or delayed operation,
- * and may return -EAGAIN when the transaction needs to be rolled. Calling
- * functions will need to handle this, and call the function until a
- * successful error code is returned.
- */
-int
-xfs_attr_remove_iter(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
- int retval, error = 0;
- struct xfs_inode *dp = args->dp;
-
- trace_xfs_attr_node_removename(args);
-
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- if (!xfs_inode_hasattr(dp))
- return -ENOATTR;
-
- /*
- * Shortform or leaf formats don't require transaction rolls and
- * thus state transitions. Call the right helper and return.
- */
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_sf_removename(args);
-
- if (xfs_attr_is_leaf(dp))
- return xfs_attr_leaf_removename(args);
-
- /*
- * Node format may require transaction rolls. Set up the
- * state context and fall into the state machine.
- */
- if (!dac->da_state) {
- error = xfs_attr_node_removename_setup(dac);
- if (error)
- return error;
- state = dac->da_state;
- }
-
- fallthrough;
- case XFS_DAS_RMTBLK:
- dac->dela_state = XFS_DAS_RMTBLK;
-
- /*
- * If there is an out-of-line value, de-allocate the blocks.
- * This is done before we remove the attribute so that we don't
- * overflow the maximum size of a transaction and/or hit a
- * deadlock.
- */
- if (args->rmtblkno > 0) {
- /*
- * May return -EAGAIN. Roll and repeat until all remote
- * blocks are removed.
- */
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN) {
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return error;
- } else if (error) {
- goto out;
- }
-
- /*
- * Refill the state structure with buffers (the prior
- * calls released our buffers) and close out this
- * transaction before proceeding.
- */
- ASSERT(args->rmtblkno == 0);
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- dac->dela_state = XFS_DAS_RM_NAME;
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_NAME:
- /*
- * If we came here fresh from a transaction roll, reattach all
- * the buffers to the current transaction.
- */
- if (dac->dela_state == XFS_DAS_RM_NAME) {
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- }
-
- retval = xfs_attr_node_removename(args, state);
-
- /*
- * Check to see if the tree needs to be collapsed. If so, roll
- * the transacton and fall into the shrink state.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
-
- dac->flags |= XFS_DAC_DEFER_FINISH;
- dac->dela_state = XFS_DAS_RM_SHRINK;
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_SHRINK:
- /*
- * If the result is small enough, push it all into the inode.
- * This is our final state so it's safe to return a dirty
- * transaction.
- */
- if (xfs_attr_is_leaf(dp))
- error = xfs_attr_node_shrink(args, state);
- ASSERT(error != -EAGAIN);
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
- goto out;
- }
-out:
- if (state)
- xfs_da_state_free(state);
- return error;
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level;
-
- trace_xfs_attr_fillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- /*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
- */
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level, error;
-
- trace_xfs_attr_refillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
-
- /*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
- */
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
-
- return 0;
-}
-
-/*
* Retrieve the attribute data from a node attribute list.
*
* This routine gets called for any attribute fork that has more than one
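The fillstate/refillstate pair removed above follows a simple detach/reattach pattern: before a transaction roll releases the buffers, each path entry records the on-disk address of its buffer; after the roll, the buffers are read back from those addresses. A minimal sketch of that pattern follows — buf_daddr() and buf_read() are hypothetical stand-ins for xfs_buf_daddr() and xfs_da3_node_read_mapped(), and the structs are toy reductions of the xfs_da_state path:

#include <stdint.h>
#include <stddef.h>

#define PATH_MAXDEPTH	5

struct blk {
	void		*bp;		/* in-core buffer, NULL once detached */
	uint64_t	disk_blkno;	/* saved on-disk address, 0 if none */
};

struct path {
	int		active;
	struct blk	blk[PATH_MAXDEPTH];
};

/* Hypothetical stand-ins for xfs_buf_daddr()/xfs_da3_node_read_mapped(). */
static uint64_t buf_daddr(void *bp) { return (uint64_t)(uintptr_t)bp; }
static int buf_read(uint64_t daddr, void **bpp)
{
	*bpp = (void *)(uintptr_t)daddr;
	return 0;
}

/* Detach: remember where each buffer lives on disk, then drop it. */
static void path_fillstate(struct path *p)
{
	for (int i = 0; i < p->active; i++) {
		if (p->blk[i].bp) {
			p->blk[i].disk_blkno = buf_daddr(p->blk[i].bp);
			p->blk[i].bp = NULL;
		} else {
			p->blk[i].disk_blkno = 0;
		}
	}
}

/* Reattach: re-read each saved block after the transaction roll. */
static int path_refillstate(struct path *p)
{
	for (int i = 0; i < p->active; i++) {
		if (p->blk[i].disk_blkno) {
			int error = buf_read(p->blk[i].disk_blkno,
					     &p->blk[i].bp);
			if (error)
				return error;
		} else {
			p->blk[i].bp = NULL;
		}
	}
	return 0;
}

int main(void)
{
	struct path p = { .active = 1, .blk = { { .bp = &p } } };

	path_fillstate(&p);
	return path_refillstate(&p);
}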
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 5e71f719bdd5..1af7abe29eef 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -28,6 +28,15 @@ struct xfs_attr_list_context;
*/
#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
+static inline bool xfs_has_larp(struct xfs_mount *mp)
+{
+#ifdef DEBUG
+ return xfs_globals.larp;
+#else
+ return false;
+#endif
+}
+
/*
* Kernel-internal version of the attrlist cursor.
*/
@@ -425,7 +434,7 @@ struct xfs_attr_list_context {
*/
/*
- * Enum values for xfs_delattr_context.da_state
+ * Enum values for xfs_attr_item.xattri_da_state
*
* These values are used by delayed attribute operations to keep track of where
* they were before they returned -EAGAIN. A return code of -EAGAIN signals the
@@ -434,46 +443,105 @@ struct xfs_attr_list_context {
* to where it was and resume executing where it left off.
*/
enum xfs_delattr_state {
- XFS_DAS_UNINIT = 0, /* No state has been set yet */
- XFS_DAS_RMTBLK, /* Removing remote blks */
- XFS_DAS_RM_NAME, /* Remove attr name */
- XFS_DAS_RM_SHRINK, /* We are shrinking the tree */
- XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */
- XFS_DAS_FOUND_NBLK, /* We found node blk for attr */
- XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */
- XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */
- XFS_DAS_RD_LEAF, /* Read in the new leaf */
- XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */
- XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */
- XFS_DAS_RM_NBLK, /* A rename is removing node blocks */
- XFS_DAS_CLR_FLAG, /* Clear incomplete flag */
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+
+ /*
+ * Initial sequence states. The replace setup code relies on the
+ * ADD and REMOVE states for a specific format to be sequential so
+ * that the initial operation can easily be transformed
+ * according to the xfs_has_larp() state.
+ */
+ XFS_DAS_SF_ADD, /* Initial sf add state */
+ XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */
+
+ XFS_DAS_LEAF_ADD, /* Initial leaf add state */
+ XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */
+
+ XFS_DAS_NODE_ADD, /* Initial node add state */
+ XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */
+
+ /* Leaf state set/replace/remove sequence */
+ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */
+ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */
+ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */
+ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */
+
+ /* Node state sequence, must match leaf state above */
+ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */
+ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */
+ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */
+ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */
+
+ XFS_DAS_DONE, /* finished operation */
};
+#define XFS_DAS_STRINGS \
+ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
+ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
+ { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
+ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
+ { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
+ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
+ { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
+ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
+ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
+ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
+ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
+ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
+ { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
+ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
+ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
+ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
+ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
+ { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
+ { XFS_DAS_DONE, "XFS_DAS_DONE" }
+
/*
- * Defines for xfs_delattr_context.flags
+ * Defines for xfs_attr_item.xattri_flags
*/
-#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */
-#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/
+#define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/
/*
* Context used for keeping track of delayed attribute operations
*/
-struct xfs_delattr_context {
- struct xfs_da_args *da_args;
+struct xfs_attr_item {
+ struct xfs_da_args *xattri_da_args;
+
+ /*
+ * Used by xfs_attr_set to hold a leaf buffer across a transaction roll
+ */
+ struct xfs_buf *xattri_leaf_bp;
/* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
- struct xfs_bmbt_irec map;
- xfs_dablk_t lblkno;
- int blkcnt;
+ struct xfs_bmbt_irec xattri_map;
+ xfs_dablk_t xattri_lblkno;
+ int xattri_blkcnt;
/* Used in xfs_attr_node_removename to roll through removing blocks */
- struct xfs_da_state *da_state;
+ struct xfs_da_state *xattri_da_state;
/* Used to keep track of current state of delayed operation */
- unsigned int flags;
- enum xfs_delattr_state dela_state;
+ unsigned int xattri_flags;
+ enum xfs_delattr_state xattri_dela_state;
+
+ /*
+ * Attr operation being performed - XFS_ATTR_OP_FLAGS_*
+ */
+ unsigned int xattri_op_flags;
+
+ /*
+ * used to log this item to an intent containing a list of attrs to
+ * commit later
+ */
+ struct list_head xattri_list;
};
+
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
@@ -489,11 +557,81 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_da_args *args);
int xfs_attr_get(struct xfs_da_args *args);
int xfs_attr_set(struct xfs_da_args *args);
-int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_remove_iter(struct xfs_delattr_context *dac);
+int xfs_attr_set_iter(struct xfs_attr_item *attr);
+int xfs_attr_remove_iter(struct xfs_attr_item *attr);
bool xfs_attr_namecheck(const void *name, size_t length);
-void xfs_delattr_context_init(struct xfs_delattr_context *dac,
- struct xfs_da_args *args);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+ unsigned int *total);
+
+extern struct kmem_cache *xfs_attri_cache;
+extern struct kmem_cache *xfs_attrd_cache;
+
+int __init xfs_attri_init_cache(void);
+void xfs_attri_destroy_cache(void);
+int __init xfs_attrd_init_cache(void);
+void xfs_attrd_destroy_cache(void);
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform
+ * to a single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+ /*
+ * When called from the completion of an attr remove to determine the
+ * next state, the attribute fork may be null. This can only occur on
+ * a pure remove, but we grab the next state before we check if a
+ * replace operation is being performed. If we are called from any other
+ * context, i_afp is guaranteed to exist. Hence if the attr fork is
+ * null, we were called from a pure remove operation and so we are done.
+ */
+ if (!args->dp->i_afp)
+ return XFS_DAS_DONE;
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_ADD;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_ADD;
+ return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_REMOVE;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_REMOVE;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_REMOVE;
+ return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+ if (xfs_has_larp(args->dp->i_mount))
+ return xfs_attr_init_remove_state(args);
+ return xfs_attr_init_add_state(args);
+}
#endif /* __XFS_ATTR_H__ */
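The enum comment above describes a roll-and-retry contract: the iter functions return -EAGAIN whenever the transaction must be rolled, and the caller re-enters with xattri_dela_state telling the state machine where to resume. A toy model of that driver loop, assuming nothing beyond the contract itself (the three-pass completion below is invented for illustration):

#include <errno.h>
#include <stdio.h>

enum das_state { DAS_LEAF_ADD, DAS_DONE };

struct attr_item {
	enum das_state	state;
	int		rolls;	/* how many "transaction rolls" happened */
};

/* One state machine step; returns -EAGAIN until the work is finished. */
static int attr_set_iter(struct attr_item *attr)
{
	if (attr->state == DAS_DONE)
		return 0;
	if (++attr->rolls >= 3)	/* pretend the third pass completes */
		attr->state = DAS_DONE;
	return attr->state == DAS_DONE ? 0 : -EAGAIN;
}

int main(void)
{
	struct attr_item attr = { .state = DAS_LEAF_ADD };
	int error;

	/* The roll-and-retry contract: re-enter until a final code returns. */
	do {
		error = attr_set_iter(&attr);
		/* a real caller would roll the transaction here */
	} while (error == -EAGAIN);

	printf("done after %d rolls, error=%d\n", attr.rolls, error);
	return 0;
}

The init_add/remove/replace_state helpers above only choose the first state of this loop; under xfs_has_larp() a replace starts on the REMOVE side, otherwise on the ADD side.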
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 014daa8c542d..15a990409463 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -28,6 +28,7 @@
#include "xfs_dir2.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_errortag.h"
/*
@@ -310,6 +311,15 @@ xfs_attr3_leaf_verify(
return fa;
/*
+ * Empty leaf blocks should never occur; they imply the existence of a
+ * software bug that needs fixing. xfs_repair also flags them as a
+ * corruption that needs fixing, so we should never let these go to
+ * disk.
+ */
+ if (ichdr.count == 0)
+ return __this_address;
+
+ /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -445,6 +455,14 @@ xfs_attr3_leaf_read(
* Namespace helper routines
*========================================================================*/
+/*
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear the
+ * existing attr down and replace it with the attr that has been logged.
+ * In this case, the INCOMPLETE flag will not be set in attr->attr_filter,
+ * but XFS_DA_OP_RECOVERY will be set in args->op_flags.
+ */
static bool
xfs_attr_match(
struct xfs_da_args *args,
@@ -452,14 +470,18 @@ xfs_attr_match(
unsigned char *name,
int flags)
{
+
if (args->namelen != namelen)
return false;
if (memcmp(args->name, name, namelen) != 0)
return false;
- /*
- * If we are looking for incomplete entries, show only those, else only
- * show complete entries.
- */
+
+ /* Recovery ignores the INCOMPLETE flag. */
+ if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+ args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return true;
+
+ /* All remaining matches need to be filtered by INCOMPLETE state. */
if (args->attr_filter !=
(flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
return false;
@@ -798,6 +820,14 @@ xfs_attr_sf_removename(
sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
error = xfs_attr_sf_findname(args, &sfe, &base);
+
+ /*
+ * If we are recovering an operation, finding nothing to
+ * remove is not an error - it just means there was nothing
+ * to clean up.
+ */
+ if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+ return 0;
if (error != -EEXIST)
return error;
size = xfs_attr_sf_entsize(sfe);
@@ -818,7 +848,7 @@ xfs_attr_sf_removename(
totsize -= size;
if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
- !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -1127,9 +1157,17 @@ xfs_attr3_leaf_to_shortform(
goto out;
if (forkoff == -1) {
- ASSERT(xfs_has_attr2(dp->i_mount));
- ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
- xfs_attr_fork_remove(dp, args->trans);
+ /*
+ * Don't remove the attr fork if this operation is the first
+ * part of an attr replace operation. We're going to add a new
+ * attr immediately, so we need to keep the attr fork around in
+ * this case.
+ */
+ if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ ASSERT(xfs_has_attr2(dp->i_mount));
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_remove(dp, args->trans);
+ }
goto out;
}
@@ -1189,6 +1227,11 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ error = -EIO;
+ goto out;
+ }
+
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
@@ -1486,8 +1529,9 @@ xfs_attr3_leaf_add_work(
entry->flags = args->attr_filter;
if (tmp)
entry->flags |= XFS_ATTR_LOCAL;
- if (args->op_flags & XFS_DA_OP_RENAME) {
- entry->flags |= XFS_ATTR_INCOMPLETE;
+ if (args->op_flags & XFS_DA_OP_REPLACE) {
+ if (!xfs_has_larp(mp))
+ entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
args->index2++;
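The reworked xfs_attr_match() above is a two-stage predicate: recovery compares only the on-disk namespace bits, while every other lookup must also match the INCOMPLETE state exactly. A sketch of that logic with made-up flag values — none of these constants are the real on-disk bits:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define NSP_ONDISK_MASK	0x0fu	/* hypothetical namespace bits */
#define ATTR_INCOMPLETE	0x80u	/* hypothetical INCOMPLETE bit */
#define OP_RECOVERY	0x01u	/* hypothetical recovery op flag */

/*
 * Recovery compares only the on-disk namespace bits; every other lookup
 * must match the filter exactly, INCOMPLETE state included.
 */
static bool attr_match(const char *want, const char *name,
		       uint32_t filter, uint32_t flags, uint32_t op_flags)
{
	if (strcmp(want, name) != 0)
		return false;
	if ((op_flags & OP_RECOVERY) &&
	    filter == (flags & NSP_ONDISK_MASK))
		return true;
	return filter == (flags & (NSP_ONDISK_MASK | ATTR_INCOMPLETE));
}

int main(void)
{
	/* Recovery finds the attr even though it is marked INCOMPLETE. */
	return !attr_match("user.x", "user.x", 0x01,
			   0x01 | ATTR_INCOMPLETE, OP_RECOVERY);
}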
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 83b95be9ded8..4250159ecced 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -568,14 +568,14 @@ xfs_attr_rmtval_stale(
*/
int
xfs_attr_rmtval_find_space(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int error;
- dac->lblkno = 0;
- dac->blkcnt = 0;
+ attr->xattri_lblkno = 0;
+ attr->xattri_blkcnt = 0;
args->rmtblkcnt = 0;
args->rmtblkno = 0;
memset(map, 0, sizeof(struct xfs_bmbt_irec));
@@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space(
if (error)
return error;
- dac->blkcnt = args->rmtblkcnt;
- dac->lblkno = args->rmtblkno;
+ attr->xattri_blkcnt = args->rmtblkcnt;
+ attr->xattri_lblkno = args->rmtblkno;
return 0;
}
@@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space(
*/
int
xfs_attr_rmtval_set_blk(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
struct xfs_inode *dp = args->dp;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int nmap;
int error;
nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno,
- dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ error = xfs_bmapi_write(args->trans, dp,
+ (xfs_fileoff_t)attr->xattri_lblkno,
+ attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
map, &nmap);
if (error)
return error;
@@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk(
(map->br_startblock != HOLESTARTBLOCK));
/* roll attribute extent map forwards */
- dac->lblkno += map->br_blockcount;
- dac->blkcnt -= map->br_blockcount;
+ attr->xattri_lblkno += map->br_blockcount;
+ attr->xattri_blkcnt -= map->br_blockcount;
return 0;
}
@@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate(
*/
int
xfs_attr_rmtval_remove(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error, done;
/*
@@ -695,8 +696,8 @@ xfs_attr_rmtval_remove(
* the parent
*/
if (!done) {
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp);
+ trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+ args->dp);
return -EAGAIN;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index d72eff30ca18..62b398edec3f 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
xfs_buf_flags_t incore_flags);
int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_remove(struct xfs_attr_item *attr);
int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
-int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac);
-int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr);
#endif /* __XFS_ATTR_REMOTE_H__ */
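xfs_attr_rmtval_set_blk() above advances xattri_lblkno/xattri_blkcnt by however many blocks each xfs_bmapi_write() call managed to map, so the operation can return -EAGAIN and resume exactly where it stopped. The same cursor shape in miniature — the 16-block per-call cap below is arbitrary, chosen only to make the loop iterate:

#include <stdio.h>

/* Toy model of rolling an allocation cursor forward chunk by chunk. */
struct alloc_cursor {
	long	lblkno;	/* next logical block to map */
	long	blkcnt;	/* blocks still to allocate */
};

/* Pretend each call maps at most 16 blocks, like one bmapi_write call. */
static long map_some(struct alloc_cursor *c)
{
	long got = c->blkcnt < 16 ? c->blkcnt : 16;

	c->lblkno += got;
	c->blkcnt -= got;
	return got;
}

int main(void)
{
	struct alloc_cursor c = { .lblkno = 100, .blkcnt = 40 };

	while (c.blkcnt > 0) {
		long got = map_some(&c);

		printf("mapped %ld blocks, next lblkno=%ld\n", got, c.lblkno);
	}
	return 0;
}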
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 74198dd82b03..6833110d1bd4 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
xfs_mount_t *mp, /* file system mount structure */
int whichfork) /* data or attr fork */
{
+ uint64_t maxblocks; /* max blocks at this level */
+ xfs_extnum_t maxleafents; /* max leaf entries possible */
int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
int maxrootrecs; /* max records in root block */
int minleafrecs; /* min records in leaf block */
int minnoderecs; /* min records in node block */
int sz; /* root block size */
/*
- * The maximum number of extents in a file, hence the maximum number of
- * leaf entries, is controlled by the size of the on-disk extent count,
- * either a signed 32-bit number for the data fork, or a signed 16-bit
- * number for the attr fork.
+ * The maximum number of extents in a fork, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count.
*
* Note that we can no longer assume that if we are in ATTR1 that the
* fork offset of all the inodes will be
@@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
* ATTR2 we have to assume the worst case scenario of a minimum size
* available.
*/
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
+ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (whichfork == XFS_DATA_FORK)
sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
- } else {
- maxleafents = MAXAEXTNUM;
+ else
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
- }
+
maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ maxblocks = howmany_64(maxleafents, minleafrecs);
for (level = 1; maxblocks > 1; level++) {
if (maxblocks <= maxrootrecs)
maxblocks = 1;
else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ maxblocks = howmany_64(maxblocks, minnoderecs);
}
mp->m_bm_maxlevels[whichfork] = level;
ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
@@ -468,7 +466,7 @@ error0:
if (bp_release)
xfs_trans_brelse(NULL, bp);
error_norelse:
- xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
__func__, i);
xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -485,7 +483,7 @@ STATIC void
xfs_bmap_validate_ret(
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_bmbt_irec_t *mval,
int nmap,
int ret_nmap)
@@ -1399,7 +1397,7 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t da_new; /* new count del alloc blocks used */
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
@@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -1950,7 +1948,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_bmbt_irec old;
@@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2479,7 +2477,7 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay(
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN)))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2616,7 +2614,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new,
int *logflagsp,
- int flags)
+ uint32_t flags)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
@@ -2626,7 +2624,7 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
@@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_startblock + new->br_blockcount == right.br_startblock &&
new->br_state == right.br_state &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align(
/*
* For large extent hint sizes, the aligned extent might be larger than
- * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
- * the length back under MAXEXTLEN. The outer allocation loops handle
- * short allocation just fine, so it is safe to do this. We only want to
- * do it when we are forced to, though, because it means more allocation
- * operations are required.
+ * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so
+ * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer
+ * allocation loops handle short allocation just fine, so it is safe to
+ * do this. We only want to do it when we are forced to, though, because
+ * it means more allocation operations are required.
*/
- while (align_alen > MAXEXTLEN)
+ while (align_alen > XFS_MAX_BMBT_EXTLEN)
align_alen -= extsz;
- ASSERT(align_alen <= MAXEXTLEN);
+ ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
/*
* If the previous block overlaps with this proposed allocation
@@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align(
return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
- /* see MAXEXTLEN handling above */
+ /* see XFS_BMBT_MAX_EXTLEN handling above */
ASSERT(orig_end <= align_off + align_alen ||
- align_alen + extsz > MAXEXTLEN);
+ align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
}
#ifdef DEBUG
@@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int n,
- int flags)
+ uint32_t flags)
{
if ((flags & XFS_BMAPI_ENTIRE) ||
got->br_startoff + got->br_blockcount <= obno) {
@@ -3811,7 +3809,7 @@ xfs_bmapi_update_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int *n,
- int flags)
+ uint32_t flags)
{
xfs_bmbt_irec_t *mval = *map;
@@ -3864,7 +3862,7 @@ xfs_bmapi_read(
xfs_filblks_t len,
struct xfs_bmbt_irec *mval,
int *nmap,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
@@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
- alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
@@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
bma->prev.br_startoff = NULLFILEOFF;
} else {
- bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+ bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
if (!bma->eof)
bma->length = XFS_FILBLKS_MIN(bma->length,
bma->got.br_startoff - bma->offset);
@@ -4184,7 +4182,7 @@ xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
struct xfs_bmbt_irec *mval,
xfs_filblks_t len,
- int flags)
+ uint32_t flags)
{
int whichfork = xfs_bmapi_whichfork(flags);
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
@@ -4312,7 +4310,7 @@ xfs_bmapi_write(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t bno, /* starting file offs. mapped */
xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
+ uint32_t flags, /* XFS_BMAPI_... */
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
@@ -4424,8 +4422,8 @@ xfs_bmapi_write(
* xfs_extlen_t and therefore 32 bits. Hence we have to
* check for 32-bit overflows and handle them here.
*/
- if (len > (xfs_filblks_t)MAXEXTLEN)
- bma.length = MAXEXTLEN;
+ if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+ bma.length = XFS_MAX_BMBT_EXTLEN;
else
bma.length = len;
@@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_iext_count_may_overflow(ip, whichfork,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
bma.got.br_startoff > offset_fsb) {
/*
@@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
bma.ip = ip;
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
- bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+ XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
/*
@@ -4629,7 +4630,7 @@ xfs_bmapi_remap(
xfs_fileoff_t bno,
xfs_filblks_t len,
xfs_fsblock_t startblock,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -4641,7 +4642,7 @@ xfs_bmapi_remap(
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(len > 0);
- ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
@@ -4801,7 +4802,7 @@ xfs_bmap_del_extent_delay(
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
xfs_filblks_t got_indlen, new_indlen, stolen;
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
int error = 0;
bool isrt;
@@ -4926,7 +4927,7 @@ xfs_bmap_del_extent_cow(
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec new;
xfs_fileoff_t del_endoff, got_endoff;
- int state = BMAP_COWFORK;
+ uint32_t state = BMAP_COWFORK;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
- int bflags) /* bmapi flags */
+ uint32_t bflags) /* bmapi flags */
{
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
@@ -5015,7 +5016,7 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
uint qfield; /* quota field to update */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
mp = ip->i_mount;
@@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
* Deleting the middle of the extent.
*/
- /*
- * For directories, -ENOSPC is returned since a directory entry
- * remove operation must not fail due to low extent count
- * availability. -ENOSPC will be handled by higher layers of XFS
- * by letting the corresponding empty Data/Free blocks to linger
- * until a future remove operation. Dabtree blocks would be
- * swapped with the last block in the leaf space and then the
- * new last block will be unmapped.
- *
- * The above logic also applies to the source directory entry of
- * a rename operation.
- */
- error = xfs_iext_count_may_overflow(ip, whichfork, 1);
- if (error) {
- ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
- whichfork == XFS_DATA_FORK);
- error = -ENOSPC;
- goto done;
- }
-
old = got;
got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5281,7 +5262,7 @@ __xfs_bunmapi(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t start, /* first file offset deleted */
xfs_filblks_t *rlen, /* i/o: amount remaining */
- int flags, /* misc flags */
+ uint32_t flags, /* misc flags */
xfs_extnum_t nexts) /* number of extents max */
{
struct xfs_btree_cur *cur; /* bmap btree cursor */
@@ -5299,7 +5280,6 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
- xfs_fileoff_t max_len;
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
bool done = false;
@@ -5318,16 +5298,6 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- /*
- * Guesstimate how many blocks we can unmap without running the risk of
- * blowing out the transaction with a mix of EFIs and reflink
- * adjustments.
- */
- if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
- max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
- else
- max_len = len;
-
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
@@ -5366,7 +5336,7 @@ __xfs_bunmapi(
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
- (nexts == 0 || extno < nexts) && max_len > 0) {
+ (nexts == 0 || extno < nexts)) {
/*
* Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
@@ -5400,14 +5370,6 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- /* How much can we safely unmap? */
- if (max_len < del.br_blockcount) {
- del.br_startoff += del.br_blockcount - max_len;
- if (!wasdel)
- del.br_startblock += del.br_blockcount - max_len;
- del.br_blockcount = max_len;
- }
-
if (!isrt)
goto delete;
@@ -5543,7 +5505,6 @@ delete:
if (error)
goto error0;
- max_len -= del.br_blockcount;
end = del.br_startoff - 1;
nodelete:
/*
@@ -5609,7 +5570,7 @@ xfs_bunmapi(
struct xfs_inode *ip,
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_extnum_t nexts,
int *done)
{
@@ -5641,7 +5602,7 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
- (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
return false;
return true;
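With 64-bit extent counts, the block math in xfs_bmap_compute_maxlevels() above moves to howmany_64(), a ceiling division on 64-bit dividends. A compact restatement of the maxlevels loop — the loop shape is taken from the diff, but the geometry numbers below are illustrative, not any real filesystem's:

#include <stdint.h>
#include <stdio.h>

/* Ceiling division, as howmany_64() does for 64-bit dividends. */
static uint64_t howmany64(uint64_t n, uint64_t d) { return (n + d - 1) / d; }

/* Shape of the maxlevels loop: size the leaves first, then interior nodes. */
static int compute_maxlevels(uint64_t maxleafents, uint64_t minleafrecs,
			     uint64_t minnoderecs, uint64_t maxrootrecs)
{
	uint64_t maxblocks = howmany64(maxleafents, minleafrecs);
	int level;

	for (level = 1; maxblocks > 1; level++) {
		if (maxblocks <= maxrootrecs)
			maxblocks = 1;
		else
			maxblocks = howmany64(maxblocks, minnoderecs);
	}
	return level;
}

int main(void)
{
	/* Illustrative numbers only, not real btree geometry. */
	printf("levels=%d\n",
	       compute_maxlevels(UINT64_C(1) << 32, 16, 16, 9));
	return 0;
}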
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 03d9aaf87413..16db95b11589 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -39,7 +39,7 @@ struct xfs_bmalloca {
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
int datatype;/* data type being allocated */
- int flags;
+ uint32_t flags;
};
#define XFS_BMAP_MAX_NMAP 4
@@ -47,17 +47,17 @@ struct xfs_bmalloca {
/*
* Flags for xfs_bmapi_*
*/
-#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
-#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
+#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */
/*
* unwritten extent conversion - this needs write cache flushing and no additional
* allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
* from written to unwritten, otherwise convert from unwritten to written.
*/
-#define XFS_BMAPI_CONVERT 0x040
+#define XFS_BMAPI_CONVERT (1u << 5)
/*
* allocate zeroed extents - this requires all newly allocated user data extents
@@ -65,7 +65,7 @@ struct xfs_bmalloca {
* Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
* during the allocation range to zeroed written extents.
*/
-#define XFS_BMAPI_ZERO 0x080
+#define XFS_BMAPI_ZERO (1u << 6)
/*
* Map the inode offset to the block given in ap->firstblock. Primarily
@@ -75,16 +75,16 @@ struct xfs_bmalloca {
* For bunmapi, this flag unmaps the range without adjusting quota, reducing
* refcount, or freeing the blocks.
*/
-#define XFS_BMAPI_REMAP 0x100
+#define XFS_BMAPI_REMAP (1u << 7)
/* Map something in the CoW fork. */
-#define XFS_BMAPI_COWFORK 0x200
+#define XFS_BMAPI_COWFORK (1u << 8)
/* Skip online discard of freed extents */
-#define XFS_BMAPI_NODISCARD 0x1000
+#define XFS_BMAPI_NODISCARD (1u << 9)
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
-#define XFS_BMAPI_NORMAP 0x2000
+#define XFS_BMAPI_NORMAP (1u << 10)
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w)
(w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
}
-static inline int xfs_bmapi_whichfork(int bmapi_flags)
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
{
if (bmapi_flags & XFS_BMAPI_COWFORK)
return XFS_COW_FORK;
@@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
* Flags for xfs_bmap_add_extent*.
*/
-#define BMAP_LEFT_CONTIG (1 << 0)
-#define BMAP_RIGHT_CONTIG (1 << 1)
-#define BMAP_LEFT_FILLING (1 << 2)
-#define BMAP_RIGHT_FILLING (1 << 3)
-#define BMAP_LEFT_DELAY (1 << 4)
-#define BMAP_RIGHT_DELAY (1 << 5)
-#define BMAP_LEFT_VALID (1 << 6)
-#define BMAP_RIGHT_VALID (1 << 7)
-#define BMAP_ATTRFORK (1 << 8)
-#define BMAP_COWFORK (1 << 9)
+#define BMAP_LEFT_CONTIG (1u << 0)
+#define BMAP_RIGHT_CONTIG (1u << 1)
+#define BMAP_LEFT_FILLING (1u << 2)
+#define BMAP_RIGHT_FILLING (1u << 3)
+#define BMAP_LEFT_DELAY (1u << 4)
+#define BMAP_RIGHT_DELAY (1u << 5)
+#define BMAP_LEFT_VALID (1u << 6)
+#define BMAP_RIGHT_VALID (1u << 7)
+#define BMAP_ATTRFORK (1u << 8)
+#define BMAP_COWFORK (1u << 9)
#define XFS_BMAP_EXT_FLAGS \
{ BMAP_LEFT_CONTIG, "LC" }, \
@@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
int whichfork);
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
+ int *nmap, uint32_t flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
@@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
-static inline int xfs_bmap_fork_to_state(int whichfork)
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
switch (whichfork) {
case XFS_ATTR_FORK:
@@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
- int flags);
+ uint32_t flags);
extern struct kmem_cache *xfs_bmap_intent_cache;
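The switch above from int flag words and 1 << n constants to uint32_t and 1u << n matters once bit 31 is in play: a signed flag word sign-extends when stored into a wider field, the unsigned form cannot, and shifting 1 into the sign bit with a plain 1 << 31 is undefined behaviour. A small demonstration (BMAPI_NORMAP is a local stand-in, not the kernel macro):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BMAPI_NORMAP	(1u << 10)	/* unsigned, like the flags above */

int main(void)
{
	/*
	 * A flag occupying bit 31 sign-extends when a signed flag word is
	 * widened; the unsigned form stays 0x80000000.
	 */
	int32_t		signed_high   = INT32_MIN;		/* bit 31 set */
	uint32_t	unsigned_high = UINT32_C(1) << 31;	/* bit 31 set */

	printf("signed widened:   %" PRIx64 "\n",
	       (uint64_t)(int64_t)signed_high);
	printf("unsigned widened: %" PRIx64 "\n", (uint64_t)unsigned_high);
	printf("NORMAP:           %x\n", BMAPI_NORMAP);
	return 0;
}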
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 453309fc85f2..2b77d45c215f 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
return xfs_bmbt_block_maxrecs(blocklen, leaf);
}
-/* Compute the max possible height for block mapping btrees. */
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
unsigned int
xfs_bmbt_maxlevels_ondisk(void)
{
@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
/* One extra level for the inode root. */
- return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c1500b238520..2aa300f7461f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -51,6 +51,52 @@ xfs_btree_magic(
return magic;
}
+static xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_fsblock_t fsb,
+ xfs_fsblock_t sibling)
+{
+ if (sibling == NULLFSBLOCK)
+ return NULL;
+ if (sibling == fsb)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t sibling)
+{
+ if (sibling == NULLAGBLOCK)
+ return NULL;
+ if (sibling == agbno)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_agbno(mp, agno, sibling))
+ return __this_address;
+ }
+ return NULL;
+}
+
/*
* Check a long btree block header. Return the address of the failing check,
* or NULL if everything is ok.
@@ -65,6 +111,8 @@ __xfs_btree_check_lblock(
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb = NULLFSBLOCK;
if (crc) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -83,16 +131,16 @@ __xfs_btree_check_lblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp)
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ be64_to_cpu(block->bb_u.l.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ be64_to_cpu(block->bb_u.l.bb_rightsib));
+ return fa;
}
/* Check a long btree block header. */
@@ -130,6 +178,9 @@ __xfs_btree_check_sblock(
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_agblock_t agbno = NULLAGBLOCK;
+ xfs_agnumber_t agno = NULLAGNUMBER;
if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -146,16 +197,18 @@ __xfs_btree_check_sblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp) {
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
+ }
+
+ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
+ be32_to_cpu(block->bb_u.s.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
+ agbno, be32_to_cpu(block->bb_u.s.bb_rightsib));
+ return fa;
}
/* Check a short btree block header. */
@@ -751,20 +804,20 @@ xfs_btree_lastrec(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets, /* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
int *last) /* output: last byte offset */
{
int i; /* current bit number */
- int64_t imask; /* mask for current bit number */
+ uint32_t imask; /* mask for current bit number */
ASSERT(fields != 0);
/*
* Find the lowest bit, so the first byte offset.
*/
- for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+ for (i = 0, imask = 1u; ; i++, imask <<= 1) {
if (imask & fields) {
*first = offsets[i];
break;
@@ -773,7 +826,7 @@ xfs_btree_offsets(
/*
* Find the highest bit, so the last byte offset.
*/
- for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
if (imask & fields) {
*last = offsets[i + 1] - 1;
break;
@@ -1456,7 +1509,7 @@ void
xfs_btree_log_block(
struct xfs_btree_cur *cur, /* btree cursor */
struct xfs_buf *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+ uint32_t fields) /* mask of fields: XFS_BB_... */
{
int first; /* first byte offset logged */
int last; /* last byte offset logged */
@@ -4271,6 +4324,21 @@ xfs_btree_visit_block(
if (xfs_btree_ptr_is_null(cur, &rptr))
return -ENOENT;
+ /*
+ * We only visit blocks once in this walk, so we have to avoid the
+ * internal xfs_btree_lookup_get_block() optimisation where it will
+ * return the same block without checking if the right sibling points
+ * back to us and creates a cyclic reference in the btree.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ } else {
+ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ }
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4445,20 +4513,21 @@ xfs_btree_lblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_fsblock_t fsb;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ be64_to_cpu(block->bb_u.l.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ be64_to_cpu(block->bb_u.l.bb_rightsib));
+ return fa;
}
/**
@@ -4499,7 +4568,9 @@ xfs_btree_sblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- xfs_agblock_t agno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
@@ -4507,14 +4578,13 @@ xfs_btree_sblock_verify(
/* sibling pointer verification */
agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ be32_to_cpu(block->bb_u.s.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ be32_to_cpu(block->bb_u.s.bb_rightsib));
+ return fa;
}
/*
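The two helpers factored out above give every sibling pointer the same three-step treatment: absent is fine, self-referencing is corruption, and anything else must pass a bounds check (against the cursor when one is available, against the raw filesystem otherwise). The shape of that check, with a plain string standing in for xfs_failaddr_t and a made-up bounds predicate:

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define NULLBLOCK	((uint64_t)-1)

/*
 * A sibling pointer is fine if absent, corrupt if it points back at the
 * block itself, and otherwise must lie within the filesystem.
 */
static const char *check_sibling(uint64_t self, uint64_t sibling,
				 bool (*valid)(uint64_t))
{
	if (sibling == NULLBLOCK)
		return NULL;
	if (sibling == self)
		return "self-referencing sibling";
	if (!valid(sibling))
		return "sibling out of bounds";
	return NULL;
}

static bool in_bounds(uint64_t b) { return b < 1000; }	/* toy fs size */

int main(void)
{
	const char *fa = check_sibling(42, 42, in_bounds);

	printf("sibling check: %s\n", fa ? fa : "ok");
	return 0;
}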
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 22d9f411fde6..eef27858a013 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
/*
* For logging record fields.
*/
-#define XFS_BB_MAGIC (1 << 0)
-#define XFS_BB_LEVEL (1 << 1)
-#define XFS_BB_NUMRECS (1 << 2)
-#define XFS_BB_LEFTSIB (1 << 3)
-#define XFS_BB_RIGHTSIB (1 << 4)
-#define XFS_BB_BLKNO (1 << 5)
-#define XFS_BB_LSN (1 << 6)
-#define XFS_BB_UUID (1 << 7)
-#define XFS_BB_OWNER (1 << 8)
+#define XFS_BB_MAGIC (1u << 0)
+#define XFS_BB_LEVEL (1u << 1)
+#define XFS_BB_NUMRECS (1u << 2)
+#define XFS_BB_LEFTSIB (1u << 3)
+#define XFS_BB_RIGHTSIB (1u << 4)
+#define XFS_BB_BLKNO (1u << 5)
+#define XFS_BB_LSN (1u << 6)
+#define XFS_BB_UUID (1u << 7)
+#define XFS_BB_OWNER (1u << 8)
#define XFS_BB_NUM_BITS 5
-#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1)
#define XFS_BB_NUM_BITS_CRC 9
-#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1)
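The 1u conversions in this hunk matter once a flag can reach bit 31:
1 << 31 shifts into the sign bit of int (undefined behaviour), and a
signed mask sign-extends when widened, which is exactly the hazard with
the old int64_t "fields" argument. A standalone illustration, not part
of the patch:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int32_t  smask = INT32_MIN;		/* bit 31 via a signed type */
		uint32_t umask = UINT32_C(1) << 31;	/* bit 31, unsigned */

		/* Widening the signed mask to 64 bits sign-extends it,
		 * silently setting the upper 32 bits as well. */
		printf("signed:   %#llx\n", (unsigned long long)(int64_t)smask);
		printf("unsigned: %#llx\n", (unsigned long long)(uint64_t)umask);
		return 0;
	}

	/* prints:
	 * signed:   0xffffffff80000000
	 * unsigned: 0x80000000
	 */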
/*
* Generic stats interface
@@ -345,7 +345,7 @@ xfs_btree_dup_cursor(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets,/* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
@@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
*/
-void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9dc1ecb9713d..aa74f3fdb571 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_errortag.h"
/*
* xfs_da_btree.c
@@ -482,6 +483,9 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
+ if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ return -EIO;
+
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
* If we need to insert and there isn't room, split the node, then
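The injection point added above relies on XFS_TEST_ERROR(): the
expression is evaluated as usual, and an armed error tag can additionally
force the branch at its configured frequency. A simplified userspace
model of that behaviour (assumed, not the kernel macro):

	#include <stdbool.h>
	#include <stdlib.h>

	/* "expr" is the normal failure condition; an armed tag also fires
	 * roughly once every "freq" calls (freq == 1 fires every time,
	 * matching XFS_RANDOM_DA_LEAF_SPLIT later in this series). */
	static bool test_error(bool expr, bool tag_armed, unsigned int freq)
	{
		if (expr)
			return true;
		return tag_armed && freq && (rand() % freq == 0);
	}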
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 0faf7d9ac241..ed2303e4d46a 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -30,6 +30,7 @@ struct xfs_da_geometry {
unsigned int free_hdr_size; /* dir2 free header size */
unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
+ xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
xfs_dir2_data_aoff_t data_first_offset;
size_t data_entry_offset;
@@ -76,27 +77,31 @@ typedef struct xfs_da_args {
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
int rmtvaluelen2; /* remote attr value length in bytes */
- int op_flags; /* operation flags */
+ uint32_t op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
/*
* Operation flags:
*/
-#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
-#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
-#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
-#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
-#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
-#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */
+#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
- { XFS_DA_OP_RENAME, "RENAME" }, \
+ { XFS_DA_OP_REPLACE, "REPLACE" }, \
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_NOTIME, "NOTIME" }
+ { XFS_DA_OP_NOTIME, "NOTIME" }, \
+ { XFS_DA_OP_REMOVE, "REMOVE" }, \
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }
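The { flag, "NAME" } pairs extended here feed the tracepoint
pretty-printer. A standalone sketch of how such a table can be consumed,
in the spirit of the kernel's __print_flags() (names invented):

	#include <stdio.h>
	#include <stddef.h>

	struct flag_name {
		unsigned int	mask;
		const char	*name;
	};

	/* Emit "JUSTCHECK|ADDNAME"-style strings from a flags word. */
	static void print_flags(unsigned int flags,
				const struct flag_name *tbl, size_t n)
	{
		const char *sep = "";

		for (size_t i = 0; i < n; i++) {
			if (flags & tbl[i].mask) {
				printf("%s%s", sep, tbl[i].name);
				sep = "|";
			}
		}
		putchar('\n');
	}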
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -197,7 +202,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
* Utility routines.
*/
-#define XFS_DABUF_MAP_HOLE_OK (1 << 0)
+#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5a49caa5c9df..25e2841084e1 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
* Directory address space divided into sections,
* spaces separated by 32GB.
*/
+#define XFS_DIR2_MAX_SPACES 3
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
@@ -688,10 +689,10 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
/*
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 0805ade2d300..ceb222b4f261 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -22,6 +22,10 @@
#include "xfs_refcount.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -184,9 +188,10 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
[XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
[XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
};
-static void
+static bool
xfs_defer_create_intent(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp,
@@ -197,6 +202,7 @@ xfs_defer_create_intent(
if (!dfp->dfp_intent)
dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
dfp->dfp_count, sort);
+ return dfp->dfp_intent != NULL;
}
/*
@@ -204,16 +210,18 @@ xfs_defer_create_intent(
* associated extents, then add the entire intake list to the end of
* the pending list.
*/
-STATIC void
+static bool
xfs_defer_create_intents(
struct xfs_trans *tp)
{
struct xfs_defer_pending *dfp;
+ bool ret = false;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- xfs_defer_create_intent(tp, dfp, true);
+ ret |= xfs_defer_create_intent(tp, dfp, true);
}
+ return ret;
}
/* Abort all the intents that were committed. */
@@ -487,7 +495,7 @@ int
xfs_defer_finish_noroll(
struct xfs_trans **tp)
{
- struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *dfp = NULL;
int error = 0;
LIST_HEAD(dop_pending);
@@ -506,17 +514,20 @@ xfs_defer_finish_noroll(
* of time that any one intent item can stick around in memory,
* pinning the log tail.
*/
- xfs_defer_create_intents(*tp);
+ bool has_intents = xfs_defer_create_intents(*tp);
+
list_splice_init(&(*tp)->t_dfops, &dop_pending);
- error = xfs_defer_trans_roll(tp);
- if (error)
- goto out_shutdown;
+ if (has_intents || dfp) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
- /* Possibly relog intent items to keep the log moving. */
- error = xfs_defer_relog(tp, &dop_pending);
- if (error)
- goto out_shutdown;
+ /* Relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+ }
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
@@ -774,17 +785,25 @@ xfs_defer_ops_continue(
struct xfs_trans *tp,
struct xfs_defer_resources *dres)
{
+ unsigned int i;
+
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
- /* Lock and join the captured inode to the new transaction. */
+ /* Lock the captured resources to the new transaction. */
if (dfc->dfc_held.dr_inos == 2)
xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
else if (dfc->dfc_held.dr_inos == 1)
xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+ /* Join the captured resources to the new transaction. */
xfs_defer_restore_resources(tp, &dfc->dfc_held);
memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+ dres->dr_bufs = 0;
/* Move captured dfops chain and state to the transaction. */
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -854,7 +873,12 @@ xfs_defer_init_item_caches(void)
error = xfs_extfree_intent_init_cache();
if (error)
goto err;
-
+ error = xfs_attri_init_cache();
+ if (error)
+ goto err;
+ error = xfs_attrd_init_cache();
+ if (error)
+ goto err;
return 0;
err:
xfs_defer_destroy_item_caches();
@@ -865,6 +889,8 @@ err:
void
xfs_defer_destroy_item_caches(void)
{
+ xfs_attri_destroy_cache();
+ xfs_attrd_destroy_cache();
xfs_extfree_intent_destroy_cache();
xfs_bmap_intent_destroy_cache();
xfs_refcount_intent_destroy_cache();
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7bb8a31ad65b..114a3a4930a3 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -19,6 +19,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_ATTR,
XFS_DEFER_OPS_TYPE_MAX,
};
@@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
/*
* Deferred operation item relogging limits.
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 5f1e4799e8fa..3cd51fa3837b 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -150,6 +150,8 @@ xfs_da_mount(
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+ dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+ mp->m_sb.sb_blocklog;
dageo->magicpct = (dageo->blksize * 37) / 100;
/* set up attribute geometry - single fsb only */
@@ -161,6 +163,12 @@ xfs_da_mount(
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+
+ if (xfs_has_large_extent_counts(mp))
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ else
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
dageo->magicpct = (dageo->blksize * 37) / 100;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index a23a52e643ad..5362908164b0 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -59,7 +59,10 @@
#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
#define XFS_ERRTAG_AG_RESV_FAIL 38
-#define XFS_ERRTAG_MAX 39
+#define XFS_ERRTAG_LARP 39
+#define XFS_ERRTAG_DA_LEAF_SPLIT 40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
+#define XFS_ERRTAG_MAX 42
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -103,5 +106,8 @@
#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#define XFS_RANDOM_AG_RESV_FAIL 1
+#define XFS_RANDOM_LARP 1
+#define XFS_RANDOM_DA_LEAF_SPLIT 1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index d665c04e69dd..afdfc8108c5f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
XFS_SB_FEAT_INCOMPAT_META_UUID| \
XFS_SB_FEAT_INCOMPAT_BIGTIME| \
- XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+ XFS_SB_FEAT_INCOMPAT_NREXT64)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature(
return (sbp->sb_features_incompat & feature) != 0;
}
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
static inline bool
xfs_sb_has_incompat_log_feature(
@@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features(
sbp->sb_features_log_incompat |= features;
}
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+}
static inline bool
xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
@@ -525,26 +534,26 @@ typedef struct xfs_agf {
#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
-#define XFS_AGF_MAGICNUM 0x00000001
-#define XFS_AGF_VERSIONNUM 0x00000002
-#define XFS_AGF_SEQNO 0x00000004
-#define XFS_AGF_LENGTH 0x00000008
-#define XFS_AGF_ROOTS 0x00000010
-#define XFS_AGF_LEVELS 0x00000020
-#define XFS_AGF_FLFIRST 0x00000040
-#define XFS_AGF_FLLAST 0x00000080
-#define XFS_AGF_FLCOUNT 0x00000100
-#define XFS_AGF_FREEBLKS 0x00000200
-#define XFS_AGF_LONGEST 0x00000400
-#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_RMAP_BLOCKS 0x00002000
-#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000
-#define XFS_AGF_REFCOUNT_ROOT 0x00008000
-#define XFS_AGF_REFCOUNT_LEVEL 0x00010000
-#define XFS_AGF_SPARE64 0x00020000
+#define XFS_AGF_MAGICNUM (1u << 0)
+#define XFS_AGF_VERSIONNUM (1u << 1)
+#define XFS_AGF_SEQNO (1u << 2)
+#define XFS_AGF_LENGTH (1u << 3)
+#define XFS_AGF_ROOTS (1u << 4)
+#define XFS_AGF_LEVELS (1u << 5)
+#define XFS_AGF_FLFIRST (1u << 6)
+#define XFS_AGF_FLLAST (1u << 7)
+#define XFS_AGF_FLCOUNT (1u << 8)
+#define XFS_AGF_FREEBLKS (1u << 9)
+#define XFS_AGF_LONGEST (1u << 10)
+#define XFS_AGF_BTREEBLKS (1u << 11)
+#define XFS_AGF_UUID (1u << 12)
+#define XFS_AGF_RMAP_BLOCKS (1u << 13)
+#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define XFS_AGF_REFCOUNT_ROOT (1u << 15)
+#define XFS_AGF_REFCOUNT_LEVEL (1u << 16)
+#define XFS_AGF_SPARE64 (1u << 17)
#define XFS_AGF_NUM_BITS 18
-#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
{ XFS_AGF_MAGICNUM, "MAGICNUM" }, \
@@ -619,22 +628,22 @@ typedef struct xfs_agi {
#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
-#define XFS_AGI_MAGICNUM (1 << 0)
-#define XFS_AGI_VERSIONNUM (1 << 1)
-#define XFS_AGI_SEQNO (1 << 2)
-#define XFS_AGI_LENGTH (1 << 3)
-#define XFS_AGI_COUNT (1 << 4)
-#define XFS_AGI_ROOT (1 << 5)
-#define XFS_AGI_LEVEL (1 << 6)
-#define XFS_AGI_FREECOUNT (1 << 7)
-#define XFS_AGI_NEWINO (1 << 8)
-#define XFS_AGI_DIRINO (1 << 9)
-#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_MAGICNUM (1u << 0)
+#define XFS_AGI_VERSIONNUM (1u << 1)
+#define XFS_AGI_SEQNO (1u << 2)
+#define XFS_AGI_LENGTH (1u << 3)
+#define XFS_AGI_COUNT (1u << 4)
+#define XFS_AGI_ROOT (1u << 5)
+#define XFS_AGI_LEVEL (1u << 6)
+#define XFS_AGI_FREECOUNT (1u << 7)
+#define XFS_AGI_NEWINO (1u << 8)
+#define XFS_AGI_DIRINO (1u << 9)
+#define XFS_AGI_UNLINKED (1u << 10)
#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
-#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define XFS_AGI_FREE_ROOT (1 << 11)
-#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1u << 11)
+#define XFS_AGI_FREE_LEVEL (1u << 12)
+#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */
#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
@@ -791,16 +800,41 @@ struct xfs_dinode {
__be32 di_nlink; /* number of links to file */
__be16 di_projid_lo; /* lower part of owner's project id */
__be16 di_projid_hi; /* higher part owner's project id */
- __u8 di_pad[6]; /* unused, zeroed space */
- __be16 di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
__be64 di_size; /* number of bytes in file */
__be64 di_nblocks; /* # of direct & btree blocks used */
__be32 di_extsize; /* basic/minimum extent size for file */
- __be32 di_nextents; /* number of extents in data fork */
- __be16 di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
__u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
__s8 di_aformat; /* format of attr fork's data */
__be32 di_dmevmask; /* DMIG event mask */
@@ -870,6 +904,56 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_UUID, "uuid" }
/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 and 2^15 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 blocks
+ * by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value;
+ * however, the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The maximum file size that can be represented by the data fork extent counter
+ * in the worst case occurs when all extents are 1 block in length and each
+ * block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and
+ * with 1KB sized blocks, a file can reach up to
+ * 1KB * (2^31) = 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
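The limits in the comment above can be checked mechanically; a small
standalone program reproducing the arithmetic (values taken from the
comment):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t max_bytes  = UINT64_C(1) << 63;	/* max file size */
		uint64_t blksz_64k  = UINT64_C(1) << 16;	/* 64kB blocks */
		uint64_t max_blocks = max_bytes / blksz_64k;	/* 2^47 */

		/* 2^31 one-block extents of 1KB each: 2^41 bytes == 2TB. */
		uint64_t small_cap_bytes = (UINT64_C(1) << 31) << 10;

		printf("2^63 / 2^16 = 2^%d blocks\n",
		       __builtin_ctzll(max_blocks));
		printf("1KB * 2^31  = %llu TB\n",
		       (unsigned long long)(small_cap_bytes >> 40));
		return 0;
	}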
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR \
+ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
+/*
* Inode minimum and maximum sizes.
*/
#define XFS_DINODE_MIN_LOG 8
@@ -918,10 +1002,6 @@ enum xfs_dinode_fmt {
((w) == XFS_DATA_FORK ? \
(dip)->di_format : \
(dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
- ((w) == XFS_DATA_FORK ? \
- be32_to_cpu((dip)->di_nextents) : \
- be16_to_cpu((dip)->di_anextents))
/*
* For block and character special files the 32bit dev_t is stored at the
@@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
}
+static inline bool xfs_dinode_has_large_extent_counts(
+ const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
+
/*
* Inode number format:
* low inopblog bits - offset in block
@@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
-#define XFS_DQTYPE_USER 0x01 /* user dquot record */
-#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */
-#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */
-#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */
+#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */
+#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */
+#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */
+#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */
/* bitmask to determine if this is a user/group/project dquot */
#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
@@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block {
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
/*
* bmbt records have a file offset (block) field that is 54 bits wide, so this
* is the largest xfs_fileoff_t that we ever expect to see.
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 505533c43a92..1cfd5bc6520a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
/*
* Minimum and maximum sizes need for growth checks.
@@ -377,7 +378,7 @@ struct xfs_bulkstat {
uint32_t bs_extsize_blks; /* extent size hint, blocks */
uint32_t bs_nlink; /* number of links */
- uint32_t bs_extents; /* number of extents */
+ uint32_t bs_extents; /* 32-bit data fork extent counter */
uint32_t bs_aextents; /* attribute number of extents */
uint16_t bs_version; /* structure version */
uint16_t bs_forkoff; /* inode fork offset in bytes */
@@ -386,8 +387,9 @@ struct xfs_bulkstat {
uint16_t bs_checked; /* checked inode metadata */
uint16_t bs_mode; /* type and mode */
uint16_t bs_pad2; /* zeroed */
+ uint64_t bs_extents64; /* 64-bit data fork extent counter */
- uint64_t bs_pad[7]; /* zeroed */
+ uint64_t bs_pad[6]; /* zeroed */
};
#define XFS_BULKSTAT_VERSION_V1 (1)
@@ -459,17 +461,28 @@ struct xfs_bulk_ireq {
* Only return results from the specified @agno. If @ino is zero, start
* with the first inode of @agno.
*/
-#define XFS_BULK_IREQ_AGNO (1 << 0)
+#define XFS_BULK_IREQ_AGNO (1U << 0)
/*
* Return bulkstat information for a single inode, where @ino value is a
* special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
* values below. Not compatible with XFS_BULK_IREQ_AGNO.
*/
-#define XFS_BULK_IREQ_SPECIAL (1 << 1)
+#define XFS_BULK_IREQ_SPECIAL (1U << 1)
-#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
- XFS_BULK_IREQ_SPECIAL)
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_SMALL.
+ */
+#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL | \
+ XFS_BULK_IREQ_NREXT64)
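A userspace caller opting into the 64-bit counter might look like the
sketch below. It assumes the v5 bulkstat interface declared in this
header (struct xfs_bulkstat_req, XFS_IOC_BULKSTAT) and that the header is
available, e.g. via xfsprogs; error handling is abbreviated:

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>	/* pulls in xfs_fs.h; assumed install path */

	/* Ask bulkstat to report data fork extent counts in bs_extents64. */
	static int bulkstat_nrext64(int fd, struct xfs_bulkstat *out,
				    uint32_t nr)
	{
		struct xfs_bulkstat_req *req;
		int ret;

		req = calloc(1, sizeof(*req) + nr * sizeof(struct xfs_bulkstat));
		if (!req)
			return -1;
		req->hdr.icount = nr;
		req->hdr.flags = XFS_BULK_IREQ_NREXT64;

		ret = ioctl(fd, XFS_IOC_BULKSTAT, req);
		if (!ret)
			memcpy(out, req->bulkstat,
			       req->hdr.ocount * sizeof(*out));
		free(req);
		return ret;
	}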
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
@@ -699,34 +712,34 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_NR 25
/* i: Repair this metadata. */
-#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
+#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
/* o: Metadata object needs repair. */
-#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1)
+#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1)
/*
* o: Metadata object could be optimized. It's not corrupt, but
* we could improve on it somehow.
*/
-#define XFS_SCRUB_OFLAG_PREEN (1 << 2)
+#define XFS_SCRUB_OFLAG_PREEN (1u << 2)
/* o: Cross-referencing failed. */
-#define XFS_SCRUB_OFLAG_XFAIL (1 << 3)
+#define XFS_SCRUB_OFLAG_XFAIL (1u << 3)
/* o: Metadata object disagrees with cross-referenced metadata. */
-#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4)
+#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4)
/* o: Scan was not complete. */
-#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5)
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5)
/* o: Metadata object looked funny but isn't corrupt. */
-#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+#define XFS_SCRUB_OFLAG_WARNING (1u << 6)
/*
* o: IFLAG_REPAIR was set but metadata object did not need fixing or
* optimization and has therefore not been altered.
*/
-#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b418fe0c0679..bf2f4bc89193 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2414,9 +2414,9 @@ out_drop:
*/
void
xfs_ialloc_log_agi(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* allocation group header buffer */
- int fields) /* bitmask of fields to log */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte number */
int last; /* last byte number */
@@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry(
igeo->new_diflags2 = 0;
if (xfs_has_bigtime(mp))
igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ if (xfs_has_large_extent_counts(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 8b5c2b709022..a7705b6a1fd3 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -60,7 +60,7 @@ void
xfs_ialloc_log_agi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* allocation group header buffer */
- int fields); /* bitmask of fields to log */
+ uint32_t fields); /* bitmask of fields to log */
/*
* Read in the allocation group header (inode allocation section)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cae9708c8587..3b1b63f9d886 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -279,6 +279,25 @@ xfs_inode_to_disk_ts(
return ts;
}
+static inline void
+xfs_inode_to_disk_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp));
+ /*
+ * We might be upgrading the inode to use larger extent counters
+ * than were previously used. Hence zero the unused field.
+ */
+ to->di_nrext64_pad = cpu_to_be16(0);
+ } else {
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
+ }
+}
+
void
xfs_inode_to_disk(
struct xfs_inode *ip,
@@ -296,7 +315,6 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
- memset(to->di_pad, 0, sizeof(to->di_pad));
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
@@ -307,8 +325,6 @@ xfs_inode_to_disk(
to->di_size = cpu_to_be64(ip->i_disk_size);
to->di_nblocks = cpu_to_be64(ip->i_nblocks);
to->di_extsize = cpu_to_be32(ip->i_extsize);
- to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
- to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = cpu_to_be16(ip->i_diflags);
@@ -323,11 +339,14 @@ xfs_inode_to_disk(
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_disk_iext_counters(ip, to);
}
static xfs_failaddr_t
@@ -336,20 +355,40 @@ xfs_dinode_verify_fork(
struct xfs_mount *mp,
int whichfork)
{
- uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t di_nextents;
+ xfs_extnum_t max_extents;
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
+
+ di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the fork should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
- switch (XFS_DFORK_FORMAT(dip, whichfork)) {
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
case XFS_DINODE_FMT_LOCAL:
/*
- * no local regular files yet
+ * No local regular files yet.
*/
- if (whichfork == XFS_DATA_FORK) {
- if (S_ISREG(be16_to_cpu(dip->di_mode)))
- return __this_address;
- if (be64_to_cpu(dip->di_size) >
- XFS_DFORK_SIZE(dip, mp, whichfork))
- return __this_address;
- }
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
if (di_nextents)
return __this_address;
break;
@@ -358,12 +397,11 @@ xfs_dinode_verify_fork(
return __this_address;
break;
case XFS_DINODE_FMT_BTREE:
- if (whichfork == XFS_ATTR_FORK) {
- if (di_nextents > MAXAEXTNUM)
- return __this_address;
- } else if (di_nextents > MAXEXTNUM) {
+ max_extents = xfs_iext_max_nextents(
+ xfs_dinode_has_large_extent_counts(dip),
+ whichfork);
+ if (di_nextents > max_extents)
return __this_address;
- }
break;
default:
return __this_address;
@@ -396,6 +434,24 @@ xfs_dinode_verify_forkoff(
return NULL;
}
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip)) {
+ if (!xfs_has_large_extent_counts(mp))
+ return __this_address;
+ if (dip->di_nrext64_pad != 0)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (dip->di_v3_pad != 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -407,6 +463,9 @@ xfs_dinode_verify(
uint16_t flags;
uint64_t flags2;
uint64_t di_size;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ xfs_filblks_t nblocks;
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return __this_address;
@@ -437,10 +496,19 @@ xfs_dinode_verify(
if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
return __this_address;
+ fa = xfs_dinode_verify_nrext64(mp, dip);
+ if (fa)
+ return fa;
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
/* Fork checks carried over from xfs_iformat_fork */
- if (mode &&
- be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) >
- be64_to_cpu(dip->di_nblocks))
+ if (mode && nextents + naextents > nblocks)
+ return __this_address;
+
+ if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
@@ -497,7 +565,7 @@ xfs_dinode_verify(
default:
return __this_address;
}
- if (dip->di_anextents)
+ if (naextents)
return __this_address;
}
@@ -639,7 +707,7 @@ xfs_inode_validate_extsize(
if (extsize_bytes % blocksize_bytes)
return __this_address;
- if (extsize > MAXEXTLEN)
+ if (extsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
@@ -696,7 +764,7 @@ xfs_inode_validate_cowextsize(
if (cowextsize_bytes % mp->m_sb.sb_blocksize)
return __this_address;
- if (cowextsize > MAXEXTLEN)
+ if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (cowextsize > mp->m_sb.sb_agblocks / 2)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9149f4f796fc..1a4cdf550f6d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -36,7 +36,7 @@ xfs_init_local_fork(
int64_t size)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int mem_size = size, real_size = 0;
+ int mem_size = size;
bool zero_terminate;
/*
@@ -50,8 +50,7 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- real_size = roundup(mem_size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
+ ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
@@ -105,7 +104,7 @@ xfs_iformat_extents(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
int state = xfs_bmap_fork_to_state(whichfork);
- int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
@@ -117,8 +116,8 @@ xfs_iformat_extents(
* we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
- (unsigned long long) ip->i_ino, nex);
+ xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+ ip->i_ino, nex);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
@@ -230,7 +229,7 @@ xfs_iformat_data_fork(
* depend on it.
*/
ip->i_df.if_format = dip->di_format;
- ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents);
+ ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
@@ -295,14 +294,14 @@ xfs_iformat_attr_fork(
struct xfs_inode *ip,
struct xfs_dinode *dip)
{
+ xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
int error = 0;
/*
* Initialize the extent count early, as the per-format routines may
* depend on it.
*/
- ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
- be16_to_cpu(dip->di_anextents));
+ ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents);
switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_LOCAL:
@@ -497,12 +496,7 @@ xfs_idata_realloc(
return;
}
- /*
- * For inline data, the underlying buffer must be a multiple of 4 bytes
- * in size so that it can be logged and stay on word boundaries.
- * We enforce that here.
- */
- ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
@@ -744,7 +738,8 @@ xfs_iext_count_may_overflow(
if (whichfork == XFS_COW_FORK)
return 0;
- max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
+ max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+ whichfork);
if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
max_exts = 10;
@@ -755,3 +750,27 @@ xfs_iext_count_may_overflow(
return 0;
}
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add. Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint nr_to_add)
+{
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+ if (!xfs_has_large_extent_counts(ip->i_mount) ||
+ xfs_inode_has_large_extent_counts(ip) ||
+ XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ return -EFBIG;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return 0;
+}
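Callers elsewhere in the series pair this with the pre-flight overflow
check; the expected pattern is roughly the following sketch, where
out_trans_cancel stands in for the caller's usual unwind label:

	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_to_add);
	if (error == -EFBIG)
		error = xfs_iext_count_upgrade(tp, ip, nr_to_add);
	if (error)
		goto out_trans_cancel;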
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 3d64a3acb0ed..4f68c1f20beb 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -21,9 +21,9 @@ struct xfs_ifork {
void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
- xfs_extnum_t if_nextents; /* # of extents in this fork */
};
/*
@@ -40,19 +40,6 @@ struct xfs_ifork {
#define XFS_IEXT_PUNCH_HOLE_CNT (1)
/*
- * Directory entry addition can cause the following,
- * 1. Data block can be added/removed.
- * A new extent can cause extent count to increase by 1.
- * 2. Free disk block can be added/removed.
- * Same behaviour as described above for Data block.
- * 3. Dabtree blocks.
- * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
- * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
- */
-#define XFS_IEXT_DIR_MANIP_CNT(mp) \
- ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
-
-/*
* Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
* be added. One extra extent for dabtree in case a local attr is
* large enough to cause a double split. It can also cause extent
@@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
return ifp->if_format;
}
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ case XFS_COW_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+ return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+ case XFS_ATTR_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+static inline xfs_extnum_t
+xfs_dfork_data_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be64_to_cpu(dip->di_big_nextents);
+
+ return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_attr_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be32_to_cpu(dip->di_big_anextents);
+
+ return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_nextents(
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_dfork_data_extents(dip);
+ case XFS_ATTR_FORK:
+ return xfs_dfork_attr_extents(dip);
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+}
+
struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
xfs_extnum_t nextents);
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
@@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+ uint nr_to_add);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index b322db523d65..f7edd1ecf6d9 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr)
/* Log Clients */
#define XFS_TRANSACTION 0x69
-#define XFS_VOLUME 0x2
#define XFS_LOG 0xaa
#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
@@ -114,7 +113,12 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_CUD_FORMAT 24
#define XLOG_REG_TYPE_BUI_FORMAT 25
#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_MAX 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_MAX 30
+
/*
* Flags to log operation header
@@ -237,6 +241,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_CUD 0x1243
#define XFS_LI_BUI 0x1244 /* bmbt update intent */
#define XFS_LI_BUD 0x1245
+#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent */
+#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -252,7 +258,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_CUI, "XFS_LI_CUI" }, \
{ XFS_LI_CUD, "XFS_LI_CUD" }, \
{ XFS_LI_BUI, "XFS_LI_BUI" }, \
- { XFS_LI_BUD, "XFS_LI_BUD" }
+ { XFS_LI_BUD, "XFS_LI_BUD" }, \
+ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
/*
* Inode Log Item Format definitions.
@@ -388,16 +396,41 @@ struct xfs_log_dinode {
uint32_t di_nlink; /* number of links to file */
uint16_t di_projid_lo; /* lower part of owner's project id */
uint16_t di_projid_hi; /* higher part of owner's project id */
- uint8_t di_pad[6]; /* unused, zeroed space */
- uint16_t di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ uint64_t di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ uint64_t di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
+ uint16_t di_flushiter; /* V2 inode incremented on flush */
+ };
+ };
xfs_log_timestamp_t di_atime; /* time last accessed */
xfs_log_timestamp_t di_mtime; /* time last modified */
xfs_log_timestamp_t di_ctime; /* time created/inode modified */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ uint32_t di_nextents;
+ uint16_t di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ uint32_t di_big_anextents;
+ uint16_t di_nrext64_pad;
+ } __packed;
+ } __packed;
uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
int8_t di_aformat; /* format of attr fork's data */
uint32_t di_dmevmask; /* DMIG event mask */
@@ -869,4 +902,36 @@ struct xfs_icreate_log {
__be32 icl_gen; /* inode generation number to use */
};
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags, lower byte is type code
+ */
+#define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */
+#define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */
+#define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
+
+/*
+ * This is the structure used to lay out an attr log item in the
+ * log.
+ */
+struct xfs_attri_log_format {
+ uint16_t alfi_type; /* attri log item type */
+ uint16_t alfi_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfi_id; /* attri identifier */
+ uint64_t alfi_ino; /* the inode for this attr operation */
+ uint32_t alfi_op_flags; /* marks the op as a set or remove */
+ uint32_t alfi_name_len; /* attr name length */
+ uint32_t alfi_value_len; /* attr value length */
+ uint32_t alfi_attr_flags;/* attr flags */
+};
+
+struct xfs_attrd_log_format {
+ uint16_t alfd_type; /* attrd log item type */
+ uint16_t alfd_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfd_alf_id; /* id of corresponding attri */
+};
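Since the type code lives in the low byte of alfi_op_flags, recovery code
can extract it with the mask defined above; a minimal illustrative
helper:

	/* Recover the SET/REMOVE/REPLACE type code from an attri item. */
	static inline unsigned int
	xfs_attri_type(const struct xfs_attri_log_format *alfi)
	{
		return alfi->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK;
	}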
+
#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index ff69a0000817..32e216255cb0 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops;
extern const struct xlog_recover_item_ops xlog_rud_item_ops;
extern const struct xlog_recover_item_ops xlog_cui_item_ops;
extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 67798ff5e14e..9975b93a7412 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -14,6 +14,7 @@
#include "xfs_trans_space.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
/*
* Calculate the maximum length in bytes that would be required for a local
@@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res(
}
/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resv)
+{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+ xfs_trans_resv_calc(mp, resv);
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * In the early days of reflink, typical log operation counts
+ * were greatly overestimated.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ resv->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ } else if (xfs_has_rmapbt(mp)) {
+ /*
+ * In the early days of non-reflink rmap, the impact of rmapbt
+ * updates on log counts were not taken into account at all.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ }
+
+ /*
+ * In the early days of reflink, we did not use deferred refcount
+ * update log items, so log reservations must be recomputed using the
+ * old calculations.
+ */
+ resv->tr_write.tr_logres =
+ xfs_calc_write_reservation_minlogsize(mp);
+ resv->tr_itruncate.tr_logres =
+ xfs_calc_itruncate_reservation_minlogsize(mp);
+ resv->tr_qm_dqalloc.tr_logres =
+ xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
+/*
* Iterate over the log space reservation table to figure out and return
* the maximum one in terms of the pre-calculated values which were done
* at mount time.
@@ -46,19 +106,25 @@ xfs_log_get_max_trans_res(
struct xfs_mount *mp,
struct xfs_trans_res *max_resp)
{
+ struct xfs_trans_resv resv = {};
struct xfs_trans_res *resp;
struct xfs_trans_res *end_resp;
+ unsigned int i;
int log_space = 0;
int attr_space;
attr_space = xfs_log_calc_max_attrsetm_res(mp);
- resp = (struct xfs_trans_res *)M_RES(mp);
- end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
- for (; resp < end_resp; resp++) {
+ xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+ resp = (struct xfs_trans_res *)&resv;
+ end_resp = (struct xfs_trans_res *)(&resv + 1);
+ for (i = 0; resp < end_resp; i++, resp++) {
int tmp = resp->tr_logcount > 1 ?
resp->tr_logres * resp->tr_logcount :
resp->tr_logres;
+
+ trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
if (log_space < tmp) {
log_space = tmp;
*max_resp = *resp; /* struct copy */
@@ -66,9 +132,10 @@ xfs_log_get_max_trans_res(
}
if (attr_space > log_space) {
- *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
+ *max_resp = resv.tr_attrsetm; /* struct copy */
max_resp->tr_logres = attr_space;
}
+ trace_xfs_log_get_max_trans_res(mp, max_resp);
}
/*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index a02c5062f9b2..cb035da3f990 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -16,7 +16,6 @@
* and quota-limits. This is a waste in the common case, but hey ...
*/
typedef uint64_t xfs_qcnt_t;
-typedef uint16_t xfs_qwarncnt_t;
typedef uint8_t xfs_dqtype_t;
@@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t;
/*
* flags for q_flags field in the dquot.
*/
-#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */
-#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */
+#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
#define XFS_DQFLAG_STRINGS \
{ XFS_DQFLAG_DIRTY, "DIRTY" }, \
@@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t;
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
-#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
+#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
* modified.
*/
-#define XFS_QMOPT_RES_REGBLKS 0x0010000
-#define XFS_QMOPT_RES_RTBLKS 0x0020000
-#define XFS_QMOPT_BCOUNT 0x0040000
-#define XFS_QMOPT_ICOUNT 0x0080000
-#define XFS_QMOPT_RTBCOUNT 0x0100000
-#define XFS_QMOPT_DELBCOUNT 0x0200000
-#define XFS_QMOPT_DELRTBCOUNT 0x0400000
-#define XFS_QMOPT_RES_INOS 0x0800000
+#define XFS_QMOPT_RES_REGBLKS (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS (1u << 8)
+#define XFS_QMOPT_BCOUNT (1u << 9)
+#define XFS_QMOPT_ICOUNT (1u << 10)
+#define XFS_QMOPT_RTBCOUNT (1u << 11)
+#define XFS_QMOPT_DELBCOUNT (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT (1u << 13)
+#define XFS_QMOPT_RES_INOS (1u << 14)
/*
* flags for dqalloc.
*/
-#define XFS_QMOPT_INHERIT 0x1000000
+#define XFS_QMOPT_INHERIT (1u << 31)
+
+#define XFS_QMOPT_FLAGS \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
/*
* flags to xfs_trans_mod_dquot.
@@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t;
(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
struct xfs_disk_dquot *ddq, xfs_dqid_t id);
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 327ba25e9e17..97e9e6020596 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -886,8 +886,13 @@ xfs_refcount_still_have_space(
{
unsigned long overhead;
- overhead = cur->bc_ag.refc.shape_changes *
- xfs_allocfree_log_count(cur->bc_mp, 1);
+ /*
+ * Worst case estimate: full splits of the free space and rmap btrees
+ * to handle each of the shape changes to the refcount btree.
+ */
+ overhead = xfs_allocfree_block_count(cur->bc_mp,
+ cur->bc_ag.refc.shape_changes);
+ overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
@@ -960,6 +965,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
+ cur->bc_ag.refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -970,7 +976,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
@@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
+ cur->bc_ag.refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
- cur->bc_ag.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
@@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 9eb01edbd89d..e8b322de7f3d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
* log (plus any key updates) so we'll conservatively assume 32 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
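A quick arithmetic check of that 32-byte estimate, using the sizes quoted in the comment above:

	one extent:  16 + 16 + 12      = 44 bytes per extent (over the estimate)
	16 extents:  16 + 12 + 16 * 16 = 284 bytes -> ~17.75 bytes per extent

so the estimate only holds once the EFI headers are amortized across several extents, as the comment says.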
-static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
-{
- return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
-}
-
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index cd322174dbff..2845019d31da 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -34,18 +34,32 @@ int
xfs_rmap_lookup_le(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
- xfs_extlen_t len,
uint64_t owner,
uint64_t offset,
unsigned int flags,
+ struct xfs_rmap_irec *irec,
int *stat)
{
+ int get_stat = 0;
+ int error;
+
cur->bc_rec.r.rm_startblock = bno;
- cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_blockcount = 0;
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ if (error || !(*stat) || !irec)
+ return error;
+
+ error = xfs_rmap_get_rec(cur, irec, &get_stat);
+ if (error)
+ return error;
+ if (!get_stat)
+ return -EFSCORRUPTED;
+
+ return 0;
}
/*
@@ -251,7 +265,6 @@ out_bad_rec:
struct xfs_find_left_neighbor_info {
struct xfs_rmap_irec high;
struct xfs_rmap_irec *irec;
- int *stat;
};
/* For each rmap given, figure out if it matches the key we want. */
@@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper(
* return a match with the same owner and adjacent physical and logical
* block ranges.
*/
-int
+STATIC int
xfs_rmap_find_left_neighbor(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
@@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
*stat = 0;
@@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor(
info.high.rm_flags = flags;
info.high.rm_blockcount = 0;
info.irec = irec;
- info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_find_left_neighbor_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * extent conversion and remap operations run a bit faster if the
+ * physical extents aren't being shared. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
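For callers, the signature change means a single call now performs both the lookup and the record retrieval. A hypothetical caller sketch (use_record() is illustrative only; passing a NULL irec skips the retrieval, as the cursor-reset call in xfs_rmap_convert() below does):

	struct xfs_rmap_irec	rec;
	int			found;
	int			error;

	error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &rec,
			&found);
	if (error)
		return error;
	if (found)
		use_record(&rec);	/* rec holds the record at or left of bno */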
/* For each rmap given, figure out if it matches the key we want. */
@@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
info.high.rm_startblock = bno;
@@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range(
info.high.rm_blockcount = 0;
*stat = 0;
info.irec = irec;
- info.stat = stat;
- trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_lookup_le_range_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * scrub run much faster on most filesystems because bmbt records are
+ * usually an exact match for rmap records. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/*
@@ -510,7 +570,7 @@ xfs_rmap_unmap(
* for the AG headers at rm_startblock == 0 created by mkfs/growfs that
* will not ever be removed from the tree.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -518,13 +578,6 @@ xfs_rmap_unmap(
goto out_error;
}
- error = xfs_rmap_get_rec(cur, &ltrec, &i);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -786,18 +839,11 @@ xfs_rmap_map(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
&have_lt);
if (error)
goto out_error;
if (have_lt) {
- error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -1022,7 +1068,7 @@ xfs_rmap_convert(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -1030,13 +1076,6 @@ xfs_rmap_convert(
goto done;
}
- error = xfs_rmap_get_rec(cur, &PREV, &i);
- if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
@@ -1140,7 +1179,7 @@ xfs_rmap_convert(
_RET_IP_);
/* reset the cursor back to PREV */
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -2677,7 +2716,7 @@ xfs_rmap_record_exists(
ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
(flags & XFS_RMAP_BMBT_BLOCK));
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
&has_record);
if (error)
return error;
@@ -2686,14 +2725,6 @@ xfs_rmap_record_exists(
return 0;
}
- error = xfs_rmap_get_rec(cur, &irec, &has_record);
- if (error)
- return error;
- if (!has_record) {
- *has_rmap = false;
- return 0;
- }
-
*has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
irec.rm_startblock + irec.rm_blockcount >= bno + len);
return 0;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index b718ebeda372..54741a591a17 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, uint64_t owner, uint64_t offset,
- unsigned int flags, int *stat);
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, uint64_t owner, uint64_t offset,
unsigned int flags, int *stat);
@@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
-int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- uint64_t owner, uint64_t offset, unsigned int flags,
- struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5740ba664867..fa180ab66b73 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1008,6 +1008,7 @@ xfs_rtfree_extent(
/* Find all the free records within a given range. */
int
xfs_rtalloc_query_range(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
@@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range(
void *priv)
{
struct xfs_rtalloc_rec rec;
- struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
xfs_rtblock_t high_key;
@@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range(
rec.ar_startext = rtstart;
rec.ar_extcount = rtend - rtstart + 1;
- error = fn(tp, &rec, priv);
+ error = fn(mp, tp, &rec, priv);
if (error)
break;
}
@@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range(
/* Find all the free records. */
int
xfs_rtalloc_query_all(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv)
@@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all(
struct xfs_rtalloc_rec keys[2];
keys[0].ar_startext = 0;
- keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+ keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
keys[0].ar_extcount = keys[1].ar_extcount = 0;
- return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+ return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
}
/* Is the given extent all free? */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index f4e84aa1d50a..a20cade590e9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -31,15 +31,66 @@
*/
/*
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
+ */
+static bool
+xfs_sb_validate_v5_features(
+ struct xfs_sb *sbp)
+{
+ /* We must not have any unknown V4 feature bits set */
+ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+ return false;
+
+ /*
+ * The CRC bit is considered an invalid V4 flag, so we have to add it
+ * manually to the OKBITS mask.
+ */
+ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+ XFS_SB_VERSION2_CRCBIT))
+ return false;
+
+ /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_CRCBIT)
+
+ if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+ return false;
+ if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+ return false;
+ return true;
+}
+
+/*
* We support all XFS versions newer than a v4 superblock with V2 directories.
*/
bool
xfs_sb_good_version(
struct xfs_sb *sbp)
{
- /* all v5 filesystems are supported */
+ /*
+ * All v5 filesystems are supported, but we must check that all the
+ * required v4 feature flags are enabled correctly as the code checks
+ * those flags and not for v5 support.
+ */
if (xfs_sb_is_v5(sbp))
- return true;
+ return xfs_sb_validate_v5_features(sbp);
+
+ /* We must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
/* versions prior to v4 are not supported */
if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
@@ -51,12 +102,6 @@ xfs_sb_good_version(
if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
return false;
- /* And must not have any unknown v4 feature bits set */
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
- return false;
-
/* It's a supported v4 filesystem */
return true;
}
@@ -70,6 +115,8 @@ xfs_sb_version_to_features(
/* optional V4 features */
if (sbp->sb_rblocks > 0)
features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+ features |= XFS_FEAT_NLINK;
if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
features |= XFS_FEAT_ATTR;
if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
@@ -124,6 +171,9 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_BIGTIME;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
features |= XFS_FEAT_NEEDSREPAIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+ features |= XFS_FEAT_NREXT64;
+
return features;
}
@@ -262,12 +312,15 @@ xfs_validate_sb_common(
bool has_dalign;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
- xfs_warn(mp, "bad magic number");
+ xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+ be32_to_cpu(dsb->sb_magicnum));
return -EWRONGFS;
}
if (!xfs_sb_good_version(sbp)) {
- xfs_warn(mp, "bad version");
+ xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
return -EWRONGFS;
}
@@ -911,6 +964,11 @@ xfs_log_sb(
* reservations that have been taken out percpu counters. If we have an
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
+ *
+ * Do not update sb_frextents here because it is not part of the lazy
+ * sb counters, despite having a percpu counter. It is always kept
+ * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+	 * and hence we don't have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
@@ -1135,6 +1193,8 @@ xfs_fs_geometry(
} else {
geo->logsectsize = BBSIZE;
}
+ if (xfs_has_large_extent_counts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 25c4cab58851..c4381388c0c1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
/*
* Values for t_flags.
*/
-#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
-#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
-#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
-#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
-#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
-#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT (1u << 5)
+/* Transaction has freed blocks returned to its reservation */
+#define XFS_TRANS_RES_FDBLKS (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
+
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6f83d9b306ee..e9913c2c5a24 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -56,15 +56,14 @@ xfs_calc_buf_res(
* Per-extent log reservation for the btree changes involved in freeing or
 * allocating an extent. In classic XFS there are two trees that may be
* modified (bnobt + cntbt). With rmap enabled, there are three trees
- * (rmapbt). With reflink, there are four trees (refcountbt). The number of
- * blocks reserved is based on the formula:
+ * (rmapbt). The number of blocks reserved is based on the formula:
*
* num trees * ((2 blocks/level * max depth) - 1)
*
* Keep in mind that max depth is calculated separately for each type of tree.
*/
uint
-xfs_allocfree_log_count(
+xfs_allocfree_block_count(
struct xfs_mount *mp,
uint num_ops)
{
@@ -73,13 +72,24 @@ xfs_allocfree_log_count(
blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
- if (xfs_has_reflink(mp))
- blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
return blocks;
}
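Plugging hypothetical tree heights into the formula above (one op, m_alloc_maxlevels = 5, rmapbt enabled with m_rmap_maxlevels = 9):

	blocks = 1 * 2 * (2 * 5 - 1) + 1 * (2 * 9 - 1)
	       = 18 + 17
	       = 35 blocks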
/*
+ * Per-extent log reservation for refcount btree changes. These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
+
+/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
* the amount of space they use on disk.
@@ -136,7 +146,7 @@ xfs_calc_inobt_res(
{
return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res(
{
uint res, size = 0;
- res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
@@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res(
/*
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
- * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
- * as well as the realtime summary block.
+ * blocks as needed to mark inuse XFS_MAX_BMBT_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
*/
static unsigned int
-xfs_rtalloc_log_count(
+xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
unsigned int rtbmp_bytes;
- rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
}
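As a worked example, assume 4096-byte blocks, sb_rextsize = 1, and the 21-bit XFS_MAX_BMBT_EXTLEN limit of 0x1fffff (2097151) blocks:

	rtbmp_bytes = (2097151 / 1) / 8 = 262143
	blocks      = (howmany(262143, 4096) + 1) * num_ops
	            = (64 + 1) * num_ops = 65 * num_ops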
@@ -233,6 +243,28 @@ xfs_rtalloc_log_count(
* register overflow from temporaries in the calculations.
*/
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction. Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
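For a sense of scale, with nr_ops = 1, a 512-byte sector, m_refc_maxlevels = 5, and 4096-byte blocks (and ignoring the fixed per-buffer overhead that xfs_calc_buf_res() folds in):

	reservation ~= 1 * 512 + (2 * 5 - 1) * 4096 = 512 + 36864 = 37376 bytes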
/*
* In a write transaction we can allocate a maximum of 2
@@ -247,7 +279,7 @@ xfs_rtalloc_log_count(
* the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
- * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap: ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 1 block
* the allocation btrees: 2 trees * (2 * max depth - 1) * block size
* And the bmap_finish transaction can free bmap blocks in a join (t3):
@@ -255,34 +287,65 @@ xfs_rtalloc_log_count(
* the agfls of the ags containing the blocks: 2 * sector size
* the super block free block counter: sector size
* the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_write_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
if (xfs_has_realtime(mp)) {
t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
} else {
t2 = 0;
}
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * two refcountbt splits for each transaction. The codebase runs
+ * refcountbt updates in separate transactions now, so to compute the
+ * minimum log size, add the refcountbtree splits back to t1 and t3 and
+ * do not account them separately as t4. Reflink did not support
+ * realtime when the reservations were established, so no adjustment to
+ * t2 is needed.
+ */
+ if (for_minlogsize) {
+ unsigned int adj = 0;
+
+ if (xfs_has_reflink(mp))
+ adj = xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 2),
+ blksz);
+ t1 += adj;
+ t3 += adj;
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 1);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp, true);
}
/*
@@ -299,33 +362,62 @@ xfs_calc_write_reservation(
* the agf for each of the ags: 2 * sector size
* the agfl for each of the ags: 2 * sector size
* the super block to reflect the freed blocks: sector size
- * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap:
+ *	2 exts * ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 2 exts * 1 block
* worst case split in allocation btrees per extent assuming 2 extents:
* 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_itruncate_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
if (xfs_has_realtime(mp)) {
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
} else {
t3 = 0;
}
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * four refcountbt splits in the same transaction as bnobt/cntbt
+ * updates. The codebase runs refcountbt updates in separate
+ * transactions now, so to compute the minimum log size, add the
+ * refcount btree splits back here and do not compute them separately
+ * as t4. Reflink did not support realtime when the reservations were
+ * established, so do not adjust t3.
+ */
+ if (for_minlogsize) {
+ if (xfs_has_reflink(mp))
+ t2 += xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 4),
+ blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 2);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_itruncate_reservation(mp, true);
}
/*
@@ -349,7 +441,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
XFS_FSB_TO_B(mp, 1))));
}
@@ -389,7 +481,7 @@ xfs_calc_link_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1))));
}
@@ -427,7 +519,7 @@ xfs_calc_remove_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -572,7 +664,7 @@ xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -670,7 +762,7 @@ xfs_calc_addafork_reservation(
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
XFS_FSB_TO_B(mp, 1))));
}
@@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation(
XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void)
*/
STATIC uint
xfs_calc_qm_dqalloc_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- return xfs_calc_write_reservation(mp) +
+ return xfs_calc_write_reservation(mp, for_minlogsize) +
xfs_calc_buf_res(1,
XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
}
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_qm_dqalloc_reservation(mp, true);
+}
+
/*
* Syncing the incore super block changes to disk.
* the super block to reflect the changes: sector size
@@ -814,36 +914,18 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
- unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
-
- /*
- * In the early days of rmap+reflink, we always set the rmap maxlevels
- * to 9 even if the AG was small enough that it would never grow to
- * that height. Transaction reservation sizes influence the minimum
- * log size calculation, which influences the size of the log that mkfs
- * creates. Use the old value here to ensure that newly formatted
- * small filesystems will mount on older kernels.
- */
- if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
- mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+ int logcount_adj = 0;
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
*/
- resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_itruncate.tr_logcount =
- XFS_ITRUNCATE_LOG_COUNT_REFLINK;
- else
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -899,11 +981,9 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+ false);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
/*
@@ -930,6 +1010,19 @@ xfs_trans_resv_calc(
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
- /* Put everything back the way it was. This goes at the end. */
- mp->m_rmap_maxlevels = rmap_maxlevels;
+ /*
+ * Add one logcount for BUI items that appear with rmap or reflink,
+ * one logcount for refcount intent items, and one logcount for rmap
+ * intent items.
+ */
+ if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+ logcount_adj++;
+ if (xfs_has_reflink(mp))
+ logcount_adj++;
+ if (xfs_has_rmapbt(mp))
+ logcount_adj++;
+
+ resp->tr_itruncate.tr_logcount += logcount_adj;
+ resp->tr_write.tr_logcount += logcount_adj;
+ resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
}
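Concretely, on a filesystem with both rmapbt and reflink enabled, logcount_adj = 1 (BUI) + 1 (CUI) + 1 (RUI) = 3, so tr_write.tr_logcount rises from XFS_WRITE_LOG_COUNT (2) to 5, noticeably less than the old XFS_WRITE_LOG_COUNT_REFLINK value of 8 that is now retained only for minimum log size calculations.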
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index fc4e9b369a3a..0554b9d775d2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,7 +73,6 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
-#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
@@ -83,13 +82,24 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
-#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink. These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
+
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
-uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b6da06b40989..373f64a492a4 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
-typedef int32_t xfs_extnum_t; /* # of extents in a file */
-typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef uint64_t xfs_extnum_t; /* # of extents in a file */
+typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
@@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t;
#define NULLAGINO ((xfs_agino_t)-1)
/*
- * Max values for extlen, extnum, aextnum.
- */
-#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
-#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
-#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
-
-/*
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index a4cbbc346f60..285995ba3947 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -133,29 +133,13 @@ xchk_bmap_get_rmap(
if (info->is_shared) {
error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
owner, offset, rflags, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
- goto out;
+ } else {
+ error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
}
-
- /*
- * Otherwise, use the (faster) regular lookup.
- */
- error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner,
- offset, rflags, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
+ if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
return false;
- if (!has_rmap)
- goto out;
- error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
-
-out:
if (!has_rmap)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -350,7 +334,7 @@ xchk_bmap_iextent(
irec->br_startoff);
/* Make sure the extent points to a valid place. */
- if (irec->br_blockcount > MAXEXTLEN)
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (info->is_rt &&
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index bf1f3607d0b6..97b54ac3075f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -23,6 +23,8 @@
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index eac15af7b08c..51820b40ab1c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -232,7 +232,8 @@ xchk_dinode(
size_t fork_recs;
unsigned long long isize;
uint64_t flags2;
- uint32_t nextents;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
prid_t prid;
uint16_t flags;
uint16_t mode;
@@ -390,8 +391,10 @@ xchk_dinode(
xchk_inode_extsize(sc, dip, ino, mode, flags);
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+
/* di_nextents */
- nextents = be32_to_cpu(dip->di_nextents);
fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_format) {
case XFS_DINODE_FMT_EXTENTS:
@@ -411,7 +414,7 @@ xchk_dinode(
/* di_forkoff */
if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
- if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
xchk_ino_set_corrupt(sc, ino);
@@ -423,19 +426,18 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
/* di_anextents */
- nextents = be16_to_cpu(dip->di_anextents);
fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- if (nextents > fork_recs)
+ if (naextents > fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
- if (nextents <= fork_recs)
+ if (naextents <= fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
default:
- if (nextents != 0)
+ if (naextents != 0)
xchk_ino_set_corrupt(sc, ino);
}
@@ -513,14 +515,14 @@ xchk_inode_xref_bmap(
&nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents < be32_to_cpu(dip->di_nextents))
+ if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
&nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents != be16_to_cpu(dip->di_anextents))
+ if (nextents != xfs_dfork_attr_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
/* Check nblocks against the inode. */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 8fa012057405..0a3bde64c675 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -40,6 +40,7 @@ xchk_setup_rt(
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
xchk_rtbitmap_rec(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -48,10 +49,10 @@ xchk_rtbitmap_rec(
xfs_rtblock_t startblock;
xfs_rtblock_t blockcount;
- startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+ startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
- if (!xfs_verify_rtext(sc->mp, startblock, blockcount))
+ if (!xfs_verify_rtext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -114,7 +115,7 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 5c52ee869272..3df9c1782ead 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -10,12 +10,12 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_acl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_trans.h"
#include <linux/posix_acl_xattr.h>
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index bb6abdcb265d..263404d0bfda 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
#else
-static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu)
+#define xfs_get_acl NULL
+#define xfs_set_acl NULL
+static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return NULL;
+ return 0;
}
-# define xfs_set_acl NULL
static inline void xfs_forget_acl(struct inode *inode, const char *name)
{
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 90b7f4d127de..8ec38b25187b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -464,7 +464,7 @@ xfs_discard_folio(
int error;
if (xfs_is_shutdown(mp))
- goto out_invalidate;
+ return;
xfs_alert_ratelimited(mp,
"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
@@ -474,8 +474,6 @@ xfs_discard_folio(
i_blocks_per_folio(inode, folio) - pageoff_fsb);
if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
-out_invalidate:
- iomap_invalidate_folio(folio, offset, folio_size(folio) - offset);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -538,11 +536,11 @@ xfs_vm_bmap(
}
STATIC int
-xfs_vm_readpage(
+xfs_vm_read_folio(
struct file *unused,
- struct page *page)
+ struct folio *folio)
{
- return iomap_readpage(page, &xfs_read_iomap_ops);
+ return iomap_read_folio(folio, &xfs_read_iomap_ops);
}
STATIC void
@@ -564,11 +562,11 @@ xfs_iomap_swapfile_activate(
}
const struct address_space_operations xfs_address_space_operations = {
- .readpage = xfs_vm_readpage,
+ .read_folio = xfs_vm_read_folio,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
.dirty_folio = filemap_dirty_folio,
- .releasepage = iomap_releasepage,
+ .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
new file mode 100644
index 000000000000..e8ac88d9fd14
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.c
@@ -0,0 +1,824 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_item.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_trans_space.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+static const struct xfs_item_ops xfs_attri_item_ops;
+static const struct xfs_item_ops xfs_attrd_item_ops;
+static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip);
+
+static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attri_log_item, attri_item);
+}
+
+STATIC void
+xfs_attri_item_free(
+ struct xfs_attri_log_item *attrip)
+{
+ kmem_free(attrip->attri_item.li_lv_shadow);
+ kvfree(attrip);
+}
+
+/*
+ * Freeing the attrip requires that we remove it from the AIL if it has already
+ * been placed there. However, the ATTRI may not yet have been placed in the
+ * AIL when called by xfs_attri_release() from ATTRD processing due to the
+ * ordering of committed vs. unpin operations in bulk insert operations. Hence
+ * we use a reference count to ensure that only the last caller frees the ATTRI.
+ */
+STATIC void
+xfs_attri_release(
+ struct xfs_attri_log_item *attrip)
+{
+ ASSERT(atomic_read(&attrip->attri_refcount) > 0);
+ if (!atomic_dec_and_test(&attrip->attri_refcount))
+ return;
+
+ xfs_trans_ail_delete(&attrip->attri_item, 0);
+ xfs_attri_item_free(attrip);
+}
+
+STATIC void
+xfs_attri_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(attrip->attri_name_len);
+
+ if (!attrip->attri_value_len)
+ return;
+
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(attrip->attri_value_len);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attri log
+ * item. We use 1 iovec for the attri_format_item, 1 for the name, and
+ * another for the value if it is present
+ */
+STATIC void
+xfs_attri_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrip->attri_format.alfi_type = XFS_LI_ATTRI;
+ attrip->attri_format.alfi_size = 1;
+
+ /*
+ * This size accounting must be done before copying the attrip into the
+ * iovec. If we do it after, the wrong size will be recorded to the log
+ * and we trip across assertion checks for bad region sizes later during
+ * the log recovery.
+ */
+
+ ASSERT(attrip->attri_name_len > 0);
+ attrip->attri_format.alfi_size++;
+
+ if (attrip->attri_value_len > 0)
+ attrip->attri_format.alfi_size++;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
+ &attrip->attri_format,
+ sizeof(struct xfs_attri_log_format));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME,
+ attrip->attri_name,
+ attrip->attri_name_len);
+ if (attrip->attri_value_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
+ attrip->attri_value,
+ attrip->attri_value_len);
+}
+
+/*
+ * The unpin operation is the last place an ATTRI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the ATTRI transaction has been successfully committed to make
+ * it this far. Therefore, we expect whoever committed the ATTRI to either
+ * construct and commit the ATTRD or drop the ATTRD's reference in the event of
+ * error. Simply drop the log's ATTRI reference now that the log is done with
+ * it.
+ */
+STATIC void
+xfs_attri_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+STATIC void
+xfs_attri_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an attri item, sized to include an additional
+ * trailing buffer that holds the name and value, if present.
+ */
+STATIC struct xfs_attri_log_item *
+xfs_attri_init(
+ struct xfs_mount *mp,
+ uint32_t name_len,
+ uint32_t value_len)
+{
+ struct xfs_attri_log_item *attrip;
+ uint32_t buffer_size = name_len + value_len;
+
+ if (buffer_size) {
+ /*
+ * This could be over 64kB in length, so we have to use
+ * kvmalloc() for this. But kvmalloc() utterly sucks, so we
+		 * use our own version.
+ */
+ attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) +
+ buffer_size);
+ } else {
+ attrip = kmem_cache_alloc(xfs_attri_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ }
+ memset(attrip, 0, sizeof(struct xfs_attri_log_item));
+
+ attrip->attri_name_len = name_len;
+ if (name_len)
+ attrip->attri_name = ((char *)attrip) +
+ sizeof(struct xfs_attri_log_item);
+ else
+ attrip->attri_name = NULL;
+
+ attrip->attri_value_len = value_len;
+ if (value_len)
+ attrip->attri_value = ((char *)attrip) +
+ sizeof(struct xfs_attri_log_item) +
+ name_len;
+ else
+ attrip->attri_value = NULL;
+
+ xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI,
+ &xfs_attri_item_ops);
+ attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip;
+ atomic_set(&attrip->attri_refcount, 2);
+
+ return attrip;
+}
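The name and value share the item's single allocation; the pointer arithmetic above simply indexes into the trailing buffer. A sketch of the resulting layout:

	+-----------------------------+ <- attrip
	| struct xfs_attri_log_item   |
	+-----------------------------+ <- attri_name (when name_len > 0)
	| name bytes (name_len)       |
	+-----------------------------+ <- attri_value (when value_len > 0)
	| value bytes (value_len)     |
	+-----------------------------+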
+
+/*
+ * Copy an attr format buffer from the given buf, and into the destination attr
+ * format structure.
+ */
+STATIC int
+xfs_attri_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_attri_log_format *dst_attr_fmt)
+{
+ struct xfs_attri_log_format *src_attr_fmt = buf->i_addr;
+ size_t len;
+
+ len = sizeof(struct xfs_attri_log_format);
+ if (buf->i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len);
+ return 0;
+}
+
+static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attrd_log_item, attrd_item);
+}
+
+STATIC void
+xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
+{
+ kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kmem_free(attrdp);
+}
+
+STATIC void
+xfs_attrd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_attrd_log_format);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attrd log item. We use
+ * only 1 iovec for the attrd_format, and we point that at the attr_log_format
+ * structure embedded in the attrd item.
+ */
+STATIC void
+xfs_attrd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrdp->attrd_format.alfd_type = XFS_LI_ATTRD;
+ attrdp->attrd_format.alfd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT,
+ &attrdp->attrd_format,
+ sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled. If
+ * the transaction is canceled, drop our reference to the ATTRI and free the
+ * ATTRD.
+ */
+STATIC void
+xfs_attrd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+
+ xfs_attri_release(attrdp->attrd_attrip);
+ xfs_attrd_item_free(attrdp);
+}
+
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
+}
+
+/*
+ * Performs one step of an attribute update intent and marks the attrd item
+ * dirty. An attr operation may be a set or a remove. Note that the
+ * transaction is marked dirty regardless of whether the operation succeeds
+ * or fails, in order to support the ATTRI/ATTRD lifecycle rules.
+ */
+STATIC int
+xfs_xattri_finish_update(
+ struct xfs_attr_item *attr,
+ struct xfs_attrd_log_item *attrdp)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ error = -EAGAIN;
+out:
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the ATTRI and frees the ATTRD
+ * 2.) shuts down the filesystem
+ */
+ args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+ /*
+ * attr intent/done items are null when logged attributes are disabled
+ */
+ if (attrdp)
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ return error;
+}
+
+/* Log an attr to the intent item. */
+STATIC void
+xfs_attr_log_item(
+ struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip,
+ struct xfs_attr_item *attr)
+{
+ struct xfs_attri_log_format *attrp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+ /*
+ * At this point the xfs_attr_item has been constructed, and we've
+ * created the log intent. Fill in the attri log item and log format
+ * structure with fields from this xfs_attr_item
+ */
+ attrp = &attrip->attri_format;
+ attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ attrp->alfi_op_flags = attr->xattri_op_flags;
+ attrp->alfi_value_len = attr->xattri_da_args->valuelen;
+ attrp->alfi_name_len = attr->xattri_da_args->namelen;
+ attrp->alfi_attr_flags = attr->xattri_da_args->attr_filter;
+
+ memcpy(attrip->attri_name, attr->xattri_da_args->name,
+ attr->xattri_da_args->namelen);
+ memcpy(attrip->attri_value, attr->xattri_da_args->value,
+ attr->xattri_da_args->valuelen);
+ attrip->attri_name_len = attr->xattri_da_args->namelen;
+ attrip->attri_value_len = attr->xattri_da_args->valuelen;
+}
+
+/* Get an ATTRI. */
+static struct xfs_log_item *
+xfs_attr_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attr_item *attr;
+
+ ASSERT(count == 1);
+
+ if (!xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return NULL;
+
+ /*
+ * Each attr item only performs one attribute operation at a time, so
+ * this is a list of one
+ */
+ list_for_each_entry(attr, items, xattri_list) {
+ attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen,
+ attr->xattri_da_args->valuelen);
+ if (attrip == NULL)
+ return NULL;
+
+ xfs_trans_add_item(tp, &attrip->attri_item);
+ xfs_attr_log_item(tp, attrip, attr);
+ }
+
+ return &attrip->attri_item;
+}
+
+/* Process an attr. */
+STATIC int
+xfs_attr_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_attr_item *attr;
+ struct xfs_attrd_log_item *done_item = NULL;
+ int error;
+
+ attr = container_of(item, struct xfs_attr_item, xattri_list);
+ if (done)
+ done_item = ATTRD_ITEM(done);
+
+ /*
+ * Always reset trans after EAGAIN cycle
+ * since the transaction is new
+ */
+ attr->xattri_da_args->trans = tp;
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error != -EAGAIN)
+ kmem_free(attr);
+
+ return error;
+}
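The -EAGAIN return value feeds the deferred-ops machinery, which rolls the transaction and retries the item until it reports completion. A rough sketch of that contract, with hypothetical helper names (the real loop lives in xfs_defer_finish()):

	/* hypothetical driver loop, heavily simplified */
	for (;;) {
		error = ops->finish_item(tp, done, &attr->xattri_list, &state);
		if (error != -EAGAIN)
			break;
		error = roll_and_rejoin(&tp);	/* hypothetical roll helper */
		if (error)
			break;
	}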
+
+/* Abort all pending ATTRs. */
+STATIC void
+xfs_attr_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_attri_release(ATTRI_ITEM(intent));
+}
+
+/* Cancel an attr */
+STATIC void
+xfs_attr_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_attr_item *attr;
+
+ attr = container_of(item, struct xfs_attr_item, xattri_list);
+ kmem_free(attr);
+}
+
+STATIC xfs_lsn_t
+xfs_attri_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+
+ /*
+ * The attrip refers to xfs_attr_item memory to log the name and value
+ * with the intent item. This already occurred when the intent was
+ * committed so these fields are no longer accessed. Clear them out of
+ * caution since we're about to free the xfs_attr_item.
+ */
+ attrip->attri_name = NULL;
+ attrip->attri_value = NULL;
+
+ /*
+ * The ATTRI is logged only once and cannot be moved in the log, so
+ * simply return the lsn at which it's been logged.
+ */
+ return lsn;
+}
+
+STATIC bool
+xfs_attri_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
+}
+
+/* Is this recovered ATTRI format ok? */
+static inline bool
+xfs_attri_validate(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attrp)
+{
+ unsigned int op = attrp->alfi_op_flags &
+ XFS_ATTR_OP_FLAGS_TYPE_MASK;
+
+ if (attrp->__pad != 0)
+ return false;
+
+ /* alfi_op_flags should be a set, replace, or remove */
+ switch (op) {
+ case XFS_ATTR_OP_FLAGS_SET:
+ case XFS_ATTR_OP_FLAGS_REPLACE:
+ case XFS_ATTR_OP_FLAGS_REMOVE:
+ break;
+ default:
+ return false;
+ }
+
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+
+ if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
+ (attrp->alfi_name_len == 0))
+ return false;
+
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * replay the set, replace, or remove operation that it describes.
+ */
+STATIC int
+xfs_attri_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_item *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res tres;
+ struct xfs_attri_log_format *attrp;
+ int error, ret = 0;
+ int total;
+ int local;
+ struct xfs_attrd_log_item *done_item = NULL;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * fields are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len))
+ return -EFSCORRUPTED;
+
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ if (error)
+ return error;
+
+ attr = kmem_zalloc(sizeof(struct xfs_attr_item) +
+ sizeof(struct xfs_da_args), KM_NOFS);
+ args = (struct xfs_da_args *)(attr + 1);
+
+ attr->xattri_da_args = args;
+ attr->xattri_op_flags = attrp->alfi_op_flags;
+
+ args->dp = ip;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->name = attrip->attri_name;
+ args->namelen = attrp->alfi_name_len;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->attr_filter = attrp->alfi_attr_flags;
+ args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT;
+
+ switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) {
+ case XFS_ATTR_OP_FLAGS_SET:
+ case XFS_ATTR_OP_FLAGS_REPLACE:
+ args->value = attrip->attri_value;
+ args->valuelen = attrp->alfi_value_len;
+ args->total = xfs_attr_calc_size(args, &local);
+ if (xfs_inode_hasattr(args->dp))
+ attr->xattri_dela_state = xfs_attr_init_replace_state(args);
+ else
+ attr->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTR_OP_FLAGS_REMOVE:
+ if (!xfs_inode_hasattr(args->dp))
+ goto out;
+ attr->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ goto out;
+
+ args->trans = tp;
+ done_item = xfs_trans_get_attrd(tp, attrip);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ ret = xfs_xattri_finish_update(attr, done_item);
+ if (ret == -EAGAIN) {
+ /* There's more work to do, so add it to this transaction */
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
+ } else {
+ error = ret;
+ }
+
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+out_unlock:
+ if (attr->xattri_leaf_bp)
+ xfs_buf_relse(attr->xattri_leaf_bp);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+out:
+ if (ret != -EAGAIN)
+ kmem_free(attr);
+ return error;
+}
+
+/* Re-log an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_attri_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_attrd_log_item *attrdp;
+ struct xfs_attri_log_item *old_attrip;
+ struct xfs_attri_log_item *new_attrip;
+ struct xfs_attri_log_format *new_attrp;
+ struct xfs_attri_log_format *old_attrp;
+
+ old_attrip = ATTRI_ITEM(intent);
+ old_attrp = &old_attrip->attri_format;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ attrdp = xfs_trans_get_attrd(tp, old_attrip);
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len,
+ old_attrp->alfi_value_len);
+ new_attrp = &new_attrip->attri_format;
+
+ new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
+ new_attrp->alfi_value_len = old_attrp->alfi_value_len;
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags;
+
+ memcpy(new_attrip->attri_name, old_attrip->attri_name,
+ new_attrip->attri_name_len);
+
+ if (new_attrip->attri_value_len > 0)
+ memcpy(new_attrip->attri_value, old_attrip->attri_value,
+ new_attrip->attri_value_len);
+
+ xfs_trans_add_item(tp, &new_attrip->attri_item);
+ set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
+
+ return &new_attrip->attri_item;
+}
+
+STATIC int
+xlog_recover_attri_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attri_log_format *attri_formatp;
+ int region = 0;
+
+ attri_formatp = item->ri_buf[region].i_addr;
+
+ /* Validate xfs_attri_log_format */
+ if (!xfs_attri_validate(mp, attri_formatp)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ /* memory alloc failure will cause replay to abort */
+ attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len,
+ attri_formatp->alfi_value_len);
+ if (attrip == NULL)
+ return -ENOMEM;
+
+ error = xfs_attri_copy_format(&item->ri_buf[region],
+ &attrip->attri_format);
+ if (error)
+ goto out;
+
+ region++;
+ memcpy(attrip->attri_name, item->ri_buf[region].i_addr,
+ attrip->attri_name_len);
+
+ if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (attrip->attri_value_len > 0) {
+ region++;
+ memcpy(attrip->attri_value, item->ri_buf[region].i_addr,
+ attrip->attri_value_len);
+ }
+
+ /*
+ * The ATTRI has two references. One for the ATTRD and one for the ATTRI to
+ * ensure it makes it into the AIL. Insert the ATTRI into the AIL
+ * directly and drop the ATTRI reference. Note that
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
+ xfs_attri_release(attrip);
+ return 0;
+out:
+ xfs_attri_item_free(attrip);
+ return error;
+}
+
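As the function above shows, a recovered ATTRI is split across up to three log item regions: ri_buf[0] holds the xfs_attri_log_format, ri_buf[1] the attr name, and ri_buf[2] the value, present only when the value length is nonzero. A standalone sketch of walking that layout, with a toy region type in place of the kernel's:

#include <stdio.h>

/* Toy stand-in for the recovery item's region array. */
struct region {
	const void *i_addr;
	size_t i_len;
};

int main(void)
{
	const char name[] = "user.comment";
	const char value[] = "hello";
	struct region ri_buf[3] = {
		{ "format", 6 },		/* region 0: log format struct */
		{ name, sizeof(name) - 1 },	/* region 1: attr name */
		{ value, sizeof(value) - 1 },	/* region 2: optional value */
	};
	int region = 0;

	printf("format: %zu bytes\n", ri_buf[region].i_len);
	region++;
	printf("name: %.*s\n", (int)ri_buf[region].i_len,
			(const char *)ri_buf[region].i_addr);
	if (ri_buf[region + 1].i_len > 0) {
		region++;
		printf("value: %.*s\n", (int)ri_buf[region].i_len,
				(const char *)ri_buf[region].i_addr);
	}
	return 0;
}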
+/*
+ * This routine is called to allocate an "attr done" (ATTRD) log item.
+ */
+static struct xfs_attrd_log_item *
+xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip)
+{
+ struct xfs_attrd_log_item *attrdp;
+
+ ASSERT(tp != NULL);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ xfs_trans_add_item(tp, &attrdp->attrd_item);
+ return attrdp;
+}
+
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ if (!intent)
+ return NULL;
+
+ return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+};
+
+/*
+ * This routine is called when an ATTRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding ATTRI if
+ * it was still in the log. To do this it searches the AIL for the ATTRI with
+ * an id equal to that in the ATTRD format structure. If we find it, we drop
+ * the ATTRD reference, which removes the ATTRI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_attrd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attrd_log_format *attrd_formatp;
+
+ attrd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_ATTRI,
+ attrd_formatp->alfd_alf_id);
+ return 0;
+}
+
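The search-and-release described in the comment above is the generic intent/done pairing used throughout log recovery. A toy sketch of the idea, using a plain linked list where the kernel walks the AIL:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy AIL: find the pending intent whose id matches the done record. */
struct intent {
	uint64_t id;
	struct intent *next;
};

static struct intent *head;

static void release_intent(uint64_t done_id)
{
	struct intent **pp = &head;

	while (*pp) {
		if ((*pp)->id == done_id) {
			struct intent *found = *pp;

			*pp = found->next;	/* unlink == remove from AIL */
			free(found);		/* drop the last reference */
			return;
		}
		pp = &(*pp)->next;
	}
	/* no match: the intent completed before the crash */
}

int main(void)
{
	struct intent *a = calloc(1, sizeof(*a));

	if (!a)
		return 1;
	a->id = 42;
	head = a;
	release_intent(42);
	printf("pending intents: %s\n", head ? "yes" : "none");
	return 0;
}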
+static const struct xfs_item_ops xfs_attri_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_attri_item_size,
+ .iop_format = xfs_attri_item_format,
+ .iop_unpin = xfs_attri_item_unpin,
+ .iop_committed = xfs_attri_item_committed,
+ .iop_release = xfs_attri_item_release,
+ .iop_recover = xfs_attri_item_recover,
+ .iop_match = xfs_attri_item_match,
+ .iop_relog = xfs_attri_item_relog,
+};
+
+const struct xlog_recover_item_ops xlog_attri_item_ops = {
+ .item_type = XFS_LI_ATTRI,
+ .commit_pass2 = xlog_recover_attri_commit_pass2,
+};
+
+static const struct xfs_item_ops xfs_attrd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_attrd_item_size,
+ .iop_format = xfs_attrd_item_format,
+ .iop_release = xfs_attrd_item_release,
+ .iop_intent = xfs_attrd_item_intent,
+};
+
+const struct xlog_recover_item_ops xlog_attrd_item_ops = {
+ .item_type = XFS_LI_ATTRD,
+ .commit_pass2 = xlog_recover_attrd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
new file mode 100644
index 000000000000..c3b779f82adb
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+#ifndef __XFS_ATTR_ITEM_H__
+#define __XFS_ATTR_ITEM_H__
+
+/* kernel only ATTRI/ATTRD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * This is the "attr intention" log item. It is used to log the fact that some
+ * extended attribute operations need to be processed. An operation is
+ * currently a set, replace, or remove. These operations are described by
+ * the xfs_attr_item which may be logged to this intent.
+ *
+ * During a normal attr operation, name and value point to the name and value
+ * fields of the caller's xfs_da_args structure. During a recovery, the name
+ * and value buffers are copied from the log, and stored in a trailing buffer
+ * attached to the xfs_attr_item until they are committed. They are freed
+ * along with the xfs_attr_item itself once the work is done.
+ */
+struct xfs_attri_log_item {
+ struct xfs_log_item attri_item;
+ atomic_t attri_refcount;
+ int attri_name_len;
+ int attri_value_len;
+ void *attri_name;
+ void *attri_value;
+ struct xfs_attri_log_format attri_format;
+};
+
+/*
+ * This is the "attr done" log item. It is used to log the fact that some attrs
+ * earlier mentioned in an attri item have been processed.
+ */
+struct xfs_attrd_log_item {
+ struct xfs_log_item attrd_item;
+ struct xfs_attri_log_item *attrd_attrip;
+ struct xfs_attrd_log_format attrd_format;
+};
+
+#endif /* __XFS_ATTR_ITEM_H__ */
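The "trailing buffer" arrangement described above can be sketched in userspace: a single allocation carries the item followed by the name and value bytes, so one free releases everything. The names and layout below are illustrative only, not the kernel's xfs_attri_init():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for the intent item with trailing buffers. */
struct attri_item {
	int name_len;
	int value_len;
	void *name;
	void *value;
};

static struct attri_item *attri_init(int name_len, int value_len)
{
	struct attri_item *ip;

	/* one allocation: struct, then name bytes, then value bytes */
	ip = calloc(1, sizeof(*ip) + name_len + value_len);
	if (!ip)
		return NULL;
	ip->name_len = name_len;
	ip->value_len = value_len;
	ip->name = ip + 1;			 /* first trailing bytes */
	ip->value = (char *)ip->name + name_len; /* then the value */
	return ip;
}

int main(void)
{
	struct attri_item *ip = attri_init(4, 5);

	if (!ip)
		return 1;
	memcpy(ip->name, "attr", 4);
	memcpy(ip->value, "value", 5);
	printf("%.*s=%.*s\n", ip->name_len, (char *)ip->name,
			ip->value_len, (char *)ip->value);
	free(ip);	/* releases the item and both buffers */
	return 0;
}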
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d1e5134cebe..90a14e85e76d 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -15,6 +15,7 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_sf.h"
#include "xfs_attr_leaf.h"
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 761dde155099..51f66e982484 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -39,6 +39,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_cache, buip);
}
@@ -54,10 +55,11 @@ xfs_bui_release(
struct xfs_bui_log_item *buip)
{
ASSERT(atomic_read(&buip->bui_refcount) > 0);
- if (atomic_dec_and_test(&buip->bui_refcount)) {
- xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_bui_item_free(buip);
- }
+ if (!atomic_dec_and_test(&buip->bui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&buip->bui_item, 0);
+ xfs_bui_item_free(buip);
}
@@ -198,14 +200,24 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_cache, budp);
}
+static struct xfs_log_item *
+xfs_bud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &BUD_ITEM(lip)->bud_buip->bui_item;
+}
+
static const struct xfs_item_ops xfs_bud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_bud_item_size,
.iop_format = xfs_bud_item_format,
.iop_release = xfs_bud_item_release,
+ .iop_intent = xfs_bud_item_intent,
};
static struct xfs_bud_log_item *
@@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update(
* 1.) releases the BUI and frees the BUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
return error;
@@ -506,6 +518,8 @@ xfs_bui_item_recover(
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
@@ -584,6 +598,7 @@ xfs_bui_item_relog(
}
static const struct xfs_item_ops xfs_bui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
.iop_unpin = xfs_bui_item_unpin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index eb2e387ba528..52be58372c63 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -119,14 +119,14 @@ retry:
*/
ralen = ap->length / mp->m_sb.sb_rextsize;
/*
- * If the old value was close enough to MAXEXTLEN that
+ * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
- * MAXEXTLEN), we don't hear about that number, and can't
+ * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
- ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+ if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
+ ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
/*
* Lock out modifications to both the RT bitmap and summary inodes
@@ -839,9 +839,11 @@ xfs_alloc_file_space(
* count, hence we need to limit the number of blocks we are
* trying to reserve to avoid an overflow. We can't allocate
* more than @nimaps extents, and an extent is limited on disk
- * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ * to XFS_MAX_BMBT_EXTLEN (21 bits), so use that to enforce the
+ * limit.
*/
- resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ resblks = min_t(xfs_fileoff_t, (e - s),
+ (XFS_MAX_BMBT_EXTLEN * nimaps));
if (unlikely(rt)) {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
rblocks = resblks;
@@ -857,6 +859,9 @@ xfs_alloc_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto error;
@@ -912,6 +917,8 @@ xfs_unmap_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1193,6 +1200,8 @@ xfs_insert_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1421,6 +1430,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(ip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
@@ -1429,6 +1441,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(tip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index e11e9ef2338f..4d8a6aece995 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -8,15 +8,18 @@
/* kernel only definitions */
+struct xfs_buf;
+struct xfs_mount;
+
/* buf log item flags */
-#define XFS_BLI_HOLD 0x01
-#define XFS_BLI_DIRTY 0x02
-#define XFS_BLI_STALE 0x04
-#define XFS_BLI_LOGGED 0x08
-#define XFS_BLI_INODE_ALLOC_BUF 0x10
-#define XFS_BLI_STALE_INODE 0x20
-#define XFS_BLI_INODE_BUF 0x40
-#define XFS_BLI_ORDERED 0x80
+#define XFS_BLI_HOLD (1u << 0)
+#define XFS_BLI_DIRTY (1u << 1)
+#define XFS_BLI_STALE (1u << 2)
+#define XFS_BLI_LOGGED (1u << 3)
+#define XFS_BLI_INODE_ALLOC_BUF (1u << 4)
+#define XFS_BLI_STALE_INODE (1u << 5)
+#define XFS_BLI_INODE_BUF (1u << 6)
+#define XFS_BLI_ORDERED (1u << 7)
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -28,11 +31,6 @@
{ XFS_BLI_INODE_BUF, "INODE_BUF" }, \
{ XFS_BLI_ORDERED, "ORDERED" }
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_buf_log_item;
-
/*
* This is the in core log item structure used to track information
* needed to log buffers. It tracks how many times the lock has been
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 0191de8ce9ce..c6fe3f6ebb6b 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -114,7 +114,7 @@ xfs_trim_extents(
}
trace_xfs_discard_extent(mp, agno, fbno, flen);
- error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+ error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
if (error)
goto out_del_cursor;
*blocks_trimmed += flen;
@@ -152,8 +152,8 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
- unsigned int granularity = q->limits.discard_granularity;
+ unsigned int granularity =
+ bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
@@ -162,7 +162,7 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
return -EOPNOTSUPP;
/*
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5afedcbc78c7..5a6c3c3c4de2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer(
res->timer = xfs_dquot_set_timeout(mp,
ktime_get_real_seconds() + qlim->time);
} else {
- if (res->timer == 0)
- res->warnings = 0;
- else
- res->timer = 0;
+ res->timer = 0;
}
}
@@ -322,6 +319,9 @@ xfs_dquot_disk_alloc(
error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, quotip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto err_cancel;
@@ -589,10 +589,6 @@ xfs_dquot_from_disk(
dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
- dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
- dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
- dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
-
dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
@@ -634,9 +630,9 @@ xfs_dquot_to_disk(
ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
- ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
- ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
- ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
+ ddqp->d_bwarns = 0;
+ ddqp->d_iwarns = 0;
+ ddqp->d_rtbwarns = 0;
ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 6b5e3cf40c8b..80c8f851a2f3 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -44,14 +44,6 @@ struct xfs_dquot_res {
* in seconds since the Unix epoch.
*/
time64_t timer;
-
- /*
- * For root dquots, this is the maximum number of warnings that will
- * be issued for this quota type. Otherwise, this is the number of
- * warnings issued against this quota. Note that none of this is
- * implemented.
- */
- xfs_qwarncnt_t warnings;
};
static inline bool
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 749fd18c4f32..296faa41d81d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_REDUCE_MAX_IEXTENTS,
XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
XFS_RANDOM_AG_RESV_FAIL,
+ XFS_RANDOM_LARP,
+ XFS_RANDOM_DA_LEAF_SPLIT,
+ XFS_RANDOM_ATTR_LEAF_TO_NODE,
};
struct xfs_errortag_attr {
@@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
+XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
+XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
+XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -211,6 +217,9 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
+ XFS_ERRORTAG_ATTR_LIST(larp),
+ XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
+ XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 5735d5ea87ee..5191e9145e55 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
* XFS panic tags -- allow a call to xfs_alert_tag() to be turned into
* a panic by setting xfs_panic_mask in a sysctl.
*/
-#define XFS_NO_PTAG 0
-#define XFS_PTAG_IFLUSH 0x00000001
-#define XFS_PTAG_LOGRES 0x00000002
-#define XFS_PTAG_AILDELETE 0x00000004
-#define XFS_PTAG_ERROR_REPORT 0x00000008
-#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
-#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
-#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
-#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
-#define XFS_PTAG_VERIFIER_ERROR 0x00000100
+#define XFS_NO_PTAG 0u
+#define XFS_PTAG_IFLUSH (1u << 0)
+#define XFS_PTAG_LOGRES (1u << 1)
+#define XFS_PTAG_AILDELETE (1u << 2)
+#define XFS_PTAG_ERROR_REPORT (1u << 3)
+#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4)
+#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5)
+#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6)
+#define XFS_PTAG_FSBLOCK_ZERO (1u << 7)
+#define XFS_PTAG_VERIFIER_ERROR (1u << 8)
#define XFS_PTAG_STRINGS \
{ XFS_NO_PTAG, "none" }, \
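This hunk, like the buf-item and inode-lock hunks elsewhere in this series, rewrites flag definitions as (1u << n). The unsigned suffix is what keeps the highest flag well defined: shifting 1 (a signed int) into bit 31 overflows, while 1u does not. A small demonstration:

#include <stdio.h>

#define FLAG_A	(1u << 0)
#define FLAG_B	(1u << 1)
/* (1u << 31) is well-defined; (1 << 31) overflows signed int (UB). */
#define FLAG_HI	(1u << 31)

int main(void)
{
	unsigned int flags = FLAG_A | FLAG_HI;

	if (flags & FLAG_HI)
		printf("high bit set: 0x%08x\n", flags);
	return 0;
}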
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 0e50f2c9348e..765be054dffe 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -58,10 +58,11 @@ xfs_efi_release(
struct xfs_efi_log_item *efip)
{
ASSERT(atomic_read(&efip->efi_refcount) > 0);
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
+ if (!atomic_dec_and_test(&efip->efi_refcount))
+ return;
+
+ xfs_trans_ail_delete(&efip->efi_item, 0);
+ xfs_efi_item_free(efip);
}
/*
@@ -306,11 +307,20 @@ xfs_efd_item_release(
xfs_efd_item_free(efdp);
}
+static struct xfs_log_item *
+xfs_efd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &EFD_ITEM(lip)->efd_efip->efi_item;
+}
+
static const struct xfs_item_ops xfs_efd_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
.iop_release = xfs_efd_item_release,
+ .iop_intent = xfs_efd_item_intent,
};
/*
@@ -380,7 +390,7 @@ xfs_trans_free_extent(
* 1.) releases the EFI and frees the EFD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
next_extent = efdp->efd_next_extent;
@@ -688,6 +698,7 @@ xfs_efi_item_relog(
}
static const struct xfs_item_ops xfs_efi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_unpin = xfs_efi_item_unpin,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5bddb1e9e0b3..a60632ecc3f0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -225,7 +225,7 @@ xfs_file_dio_read(
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
if (ret)
return ret;
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -310,7 +310,7 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- int *iolock)
+ unsigned int *iolock)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -513,7 +513,7 @@ xfs_file_dio_write_aligned(
struct kiocb *iocb,
struct iov_iter *from)
{
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
ret = xfs_ilock_iocb(iocb, iolock);
@@ -534,7 +534,7 @@ xfs_file_dio_write_aligned(
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, 0);
+ &xfs_dio_write_ops, 0, NULL, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -566,7 +566,7 @@ xfs_file_dio_write_unaligned(
{
size_t isize = i_size_read(VFS_I(ip));
size_t count = iov_iter_count(from);
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
ssize_t ret;
@@ -612,7 +612,7 @@ retry_exclusive:
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, flags, 0);
+ &xfs_dio_write_ops, flags, NULL, 0);
/*
* Retry unaligned I/O with exclusive blocking semantics if the DIO
@@ -655,7 +655,7 @@ xfs_file_dax_write(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- int iolock = XFS_IOLOCK_EXCL;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
loff_t pos;
@@ -694,13 +694,11 @@ xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
bool cleared_space = false;
- int iolock;
+ unsigned int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
@@ -767,9 +765,7 @@ xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
size_t ocount = iov_iter_count(from);
@@ -1167,12 +1163,10 @@ xfs_file_open(
struct inode *inode,
struct file *file)
{
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
- return 0;
+ return generic_file_open(inode, file);
}
STATIC int
@@ -1181,7 +1175,7 @@ xfs_dir_open(
struct file *file)
{
struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
int error;
error = xfs_file_open(inode, file);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6a3ce0f6dc9e..be9bcf8a1f99 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -128,11 +128,12 @@ xfs_filestream_pick_ag(
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
if (err) {
- xfs_perag_put(pag);
- if (err != -EAGAIN)
+ if (err != -EAGAIN) {
+ xfs_perag_put(pag);
return err;
+ }
/* Couldn't lock the AGF, skip this AG. */
- continue;
+ goto next_ag;
}
}
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 10e1cb71439e..bb23199f65c3 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -450,11 +450,11 @@ xfs_getfsmap_logdev(
/* Transform a rtbitmap "record" into a fsmap */
STATIC int
xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
- struct xfs_mount *mp = tp->t_mountp;
struct xfs_getfsmap_info *info = priv;
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
@@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
ahigh.ar_startext++;
- error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+ error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
goto err;
@@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
info->last = true;
ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
- error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
if (error)
goto err;
err:
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 68f74549fa22..888839e75d11 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -349,10 +349,7 @@ xfs_fs_counts(
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
-
- spin_lock(&mp->m_sb_lock);
- cnt->freertx = mp->m_sb.sb_frextents;
- spin_unlock(&mp->m_sb_lock);
+ cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
}
/*
@@ -512,7 +509,7 @@ xfs_fs_goingdown(
void
xfs_do_force_shutdown(
struct xfs_mount *mp,
- int flags,
+ uint32_t flags,
char *fname,
int lnnum)
{
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f62fa652c2fd..4d0a98f920ca 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = {
#endif
#ifdef DEBUG
.pwork_threads = -1, /* automatic thread detection */
+ .larp = false, /* log attribute replay */
#endif
};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bffd6eb0b298..5269354b1b69 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- uint64_t freertx;
if (!XFS_IS_REALTIME_INODE(ip))
return false;
- freertx = READ_ONCE(mp->m_sb.sb_frextents);
- return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
+ if (__percpu_counter_compare(&mp->m_frextents,
+ mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip) (false)
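The __percpu_counter_compare() call above replaces a READ_ONCE() of a superblock field because free rt extents are now tracked in a per-cpu counter; the helper compares against the cheap approximate sum first and only folds in the per-cpu deltas when the answer could fall inside the error window (roughly ncpus * batch). A toy model of that two-step compare, with made-up numbers:

#include <stdio.h>

static long long approx_sum = 95;	/* cheap, possibly stale */
static long long exact_sum = 103;	/* expensive to compute */
static int ncpus = 4;

/* Returns -1/0/1 like a three-way compare of the counter against rhs. */
static int counter_compare(long long rhs, int batch)
{
	long long err = (long long)ncpus * batch;

	if (approx_sum > rhs + err)
		return 1;		/* definitely greater */
	if (approx_sum < rhs - err)
		return -1;		/* definitely smaller */
	/* too close to call: fall back to the exact sum */
	if (exact_sum > rhs)
		return 1;
	return exact_sum < rhs ? -1 : 0;
}

int main(void)
{
	printf("compare(100, 8) = %d\n", counter_compare(100, 8));
	return 0;
}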
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 508e184e3b8f..b05314d48176 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 39ae53efb3ab..b2879870a17e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -416,10 +416,12 @@ xfs_lockdep_subclass_ok(
* parent locking. Care must be taken to ensure we don't overrun the subclass
* storage fields in the class mask we build.
*/
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
+static inline uint
+xfs_lock_inumorder(
+ uint lock_mode,
+ uint subclass)
{
- int class = 0;
+ uint class = 0;
ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
XFS_ILOCK_RTSUM)));
@@ -464,7 +466,10 @@ xfs_lock_inodes(
int inodes,
uint lock_mode)
{
- int attempts = 0, i, j, try_lock;
+ int attempts = 0;
+ uint i;
+ int j;
+ bool try_lock;
struct xfs_log_item *lp;
/*
@@ -489,9 +494,9 @@ xfs_lock_inodes(
} else if (lock_mode & XFS_MMAPLOCK_EXCL)
ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
- try_lock = 0;
- i = 0;
again:
+ try_lock = false;
+ i = 0;
for (; i < inodes; i++) {
ASSERT(ips[i]);
@@ -506,7 +511,7 @@ again:
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = &ips[j]->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
- try_lock++;
+ try_lock = true;
}
}
@@ -546,8 +551,6 @@ again:
if ((attempts % 5) == 0) {
delay(1); /* Don't just spin the CPU */
}
- i = 0;
- try_lock = 0;
goto again;
}
}
@@ -1024,11 +1027,6 @@ xfs_create(
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
* A newly created regular or special file just has one directory
* entry pointing to it, but a directory also has the "." entry
@@ -1242,11 +1240,6 @@ xfs_link(
if (error)
goto std_return;
- error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto error_return;
-
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
@@ -3212,35 +3205,6 @@ retry:
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
- *
- * Extent count overflow check:
- *
- * From the perspective of src_dp, a rename operation is essentially a
- * directory entry remove operation. Hence the only place where we check
- * for extent count overflow for src_dp is in
- * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
- * -ENOSPC when it detects a possible extent count overflow and in
- * response, the higher layers of directory handling code do the
- * following:
- * 1. Data/Free blocks: XFS lets these blocks linger until a
- * future remove operation removes them.
- * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
- * Leaf space and unmaps the last block.
- *
- * For target_dp, there are two cases depending on whether the
- * destination directory entry exists or not.
- *
- * When destination directory entry does not exist (i.e. target_ip ==
- * NULL), extent count overflow check is performed only when transaction
- * has a non-zero sized space reservation associated with it. With a
- * zero-sized space reservation, XFS allows a rename operation to
- * continue only when the directory has sufficient free space in its
- * data/leaf/free space blocks to hold the new entry.
- *
- * When destination directory entry exists (i.e. target_ip != NULL), all
- * we need to do is change the inode number associated with the already
- * existing entry. Hence there is no need to perform an extent count
- * overflow check.
*/
if (target_ip == NULL) {
/*
@@ -3251,12 +3215,6 @@ retry:
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
- } else {
- error = xfs_iext_count_may_overflow(target_dp,
- XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
}
} else {
/*
@@ -3424,18 +3382,12 @@ retry:
* inode number of the whiteout inode rather than removing it
* altogether.
*/
- if (wip) {
+ if (wip)
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else {
- /*
- * NOTE: We don't need to check for extent count overflow here
- * because the dir remove name code will leave the dir block in
- * place if the extent count would overflow.
- */
+ else
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
- }
if (error)
goto out_trans_cancel;
@@ -3517,8 +3469,8 @@ xfs_iflush(
if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: detected corrupt incore inode %Lu, "
- "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
+ "%s: detected corrupt incore inode %llu, "
+ "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
__func__, ip->i_ino,
ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
ip->i_nblocks, ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 740ab13d1aa2..7be6f8e705ab 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
}
+static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -278,12 +283,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
*/
-#define XFS_IOLOCK_EXCL (1<<0)
-#define XFS_IOLOCK_SHARED (1<<1)
-#define XFS_ILOCK_EXCL (1<<2)
-#define XFS_ILOCK_SHARED (1<<3)
-#define XFS_MMAPLOCK_EXCL (1<<4)
-#define XFS_MMAPLOCK_SHARED (1<<5)
+#define XFS_IOLOCK_EXCL (1u << 0)
+#define XFS_IOLOCK_SHARED (1u << 1)
+#define XFS_ILOCK_EXCL (1u << 2)
+#define XFS_ILOCK_SHARED (1u << 3)
+#define XFS_MMAPLOCK_EXCL (1u << 4)
+#define XFS_MMAPLOCK_SHARED (1u << 5)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
@@ -350,19 +355,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
*/
#define XFS_IOLOCK_SHIFT 16
#define XFS_IOLOCK_MAX_SUBCLASS 3
-#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_IOLOCK_DEP_MASK 0x000f0000u
#define XFS_MMAPLOCK_SHIFT 20
#define XFS_MMAPLOCK_NUMORDER 0
#define XFS_MMAPLOCK_MAX_SUBCLASS 3
-#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u
#define XFS_ILOCK_SHIFT 24
-#define XFS_ILOCK_PARENT_VAL 5
+#define XFS_ILOCK_PARENT_VAL 5u
#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL 6
-#define XFS_ILOCK_RTSUM_VAL 7
-#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_ILOCK_RTBITMAP_VAL 6u
+#define XFS_ILOCK_RTSUM_VAL 7u
+#define XFS_ILOCK_DEP_MASK 0xff000000u
#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9e6ef55cf29e..721def0639fd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- *nbytes += roundup(ip->i_df.if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
*nvecs += 1;
}
break;
@@ -112,7 +112,7 @@ xfs_inode_item_attr_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- *nbytes += roundup(ip->i_afp->if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes);
*nvecs += 1;
}
break;
@@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data, data_bytes);
- ilf->ilf_dsize = (unsigned)data_bytes;
+ ip->i_df.if_u1.if_data,
+ ip->i_df.if_bytes);
+ ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
@@ -288,17 +283,11 @@ xfs_inode_item_format_attr_fork(
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_afp->if_bytes, 4);
ASSERT(ip->i_afp->if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
ip->i_afp->if_u1.if_data,
- data_bytes);
- ilf->ilf_asize = (unsigned)data_bytes;
+ ip->i_afp->if_bytes);
+ ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -359,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode(
}
}
+static inline void
+xfs_inode_to_log_dinode_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_big_anextents = xfs_ifork_nextents(ip->i_afp);
+ to->di_nrext64_pad = 0;
+ } else {
+ to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_anextents = xfs_ifork_nextents(ip->i_afp);
+ }
+}
+
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
@@ -374,7 +378,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
- memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
@@ -386,8 +389,6 @@ xfs_inode_to_log_dinode(
to->di_size = ip->i_disk_size;
to->di_nblocks = ip->i_nblocks;
to->di_extsize = ip->i_extsize;
- to->di_nextents = xfs_ifork_nextents(&ip->i_df);
- to->di_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = ip->i_diflags;
@@ -407,11 +408,14 @@ xfs_inode_to_log_dinode(
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_log_dinode_iext_counters(ip, to);
}
/*
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 239dd2e3384e..d28ffaebd067 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts(
return ts;
}
+static inline bool xfs_log_dinode_has_large_extent_counts(
+ const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_NREXT64);
+}
+
+static inline void
+xfs_log_dinode_to_disk_iext_counters(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ if (xfs_log_dinode_has_large_extent_counts(from)) {
+ to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
+ to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
+ to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
+ } else {
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ }
+}
+
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
@@ -158,7 +181,6 @@ xfs_log_dinode_to_disk(
to->di_nlink = cpu_to_be32(from->di_nlink);
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
@@ -167,8 +189,6 @@ xfs_log_dinode_to_disk(
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
to->di_forkoff = from->di_forkoff;
to->di_aformat = from->di_aformat;
to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
@@ -184,12 +204,66 @@ xfs_log_dinode_to_disk(
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_log_dinode_to_disk_iext_counters(from, to);
+}
+
+STATIC int
+xlog_dinode_verify_extent_counts(
+ struct xfs_mount *mp,
+ struct xfs_log_dinode *ldip)
+{
+ xfs_extnum_t nextents;
+ xfs_aextnum_t anextents;
+
+ if (xfs_log_dinode_has_large_extent_counts(ldip)) {
+ if (!xfs_has_large_extent_counts(mp) ||
+ (ldip->di_nrext64_pad != 0)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode large extent count format",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
+ ldip->di_ino, xfs_has_large_extent_counts(mp),
+ ldip->di_nrext64_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_big_nextents;
+ anextents = ldip->di_big_anextents;
+ } else {
+ if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode di_v3_pad",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_v3_pad 0x%llx",
+ ldip->di_ino, ldip->di_v3_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_nextents;
+ anextents = ldip->di_anextents;
+ }
+
+ if (unlikely(nextents + anextents > ldip->di_nblocks)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
+ ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
+ anextents, ldip->di_nblocks);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
STATIC int
@@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2(
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for regular file",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2(
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE) &&
(ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for directory",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
}
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
- __func__, item, dip, bp, in_f->ilf_ino,
- ldip->di_nextents + ldip->di_anextents,
- ldip->di_nblocks);
- error = -EFSCORRUPTED;
+
+ error = xlog_dinode_verify_extent_counts(mp, ldip);
+ if (error)
goto out_release;
- }
+
if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
+ "Bad inode 0x%llx, di_forkoff 0x%x",
+ in_f->ilf_ino, ldip->di_forkoff);
error = -EFSCORRUPTED;
goto out_release;
}
isize = xfs_log_dinode_size(mp);
if (unlikely(item->ri_buf[1].i_len > isize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
+ mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
- __func__, item->ri_buf[1].i_len, item);
+ "Bad inode 0x%llx log dinode size 0x%x",
+ in_f->ilf_ino, item->ri_buf[1].i_len);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2(
ASSERT(in_f->ilf_size <= 4);
ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
ASSERT(!(fields & XFS_ILOG_DFORK) ||
- (len == in_f->ilf_dsize));
+ (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
switch (fields & XFS_ILOG_DFORK) {
case XFS_ILOG_DDATA:
@@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2(
}
len = item->ri_buf[attr_index].i_len;
src = item->ri_buf[attr_index].i_addr;
- ASSERT(len == in_f->ilf_asize);
+ ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
case XFS_ILOG_ADATA:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 83481005317a..0e5cb7936206 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -15,6 +15,8 @@
#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -35,8 +37,6 @@
#include "xfs_health.h"
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -813,6 +813,9 @@ xfs_bulk_ireq_setup(
if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
return -ECANCELED;
+ if (hdr->flags & XFS_BULK_IREQ_NREXT64)
+ breq->flags |= XFS_IBULK_NREXT64;
+
return 0;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ca25ed89b706..2f54b701eead 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -17,6 +17,8 @@
#include "xfs_itable.h"
#include "xfs_fsops.h"
#include "xfs_rtalloc.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_ioctl.h"
#include "xfs_ioctl32.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e552ce541ec2..5a393259a3a3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -251,6 +251,8 @@ xfs_iomap_write_direct(
return error;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, nr_exts);
if (error)
goto out_trans_cancel;
@@ -402,7 +404,7 @@ xfs_iomap_prealloc_size(
*/
plen = prev.br_blockcount;
while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
- if (plen > MAXEXTLEN / 2 ||
+ if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
isnullstartblock(got.br_startblock) ||
got.br_startoff + got.br_blockcount != prev.br_startoff ||
got.br_startblock + got.br_blockcount != prev.br_startblock)
@@ -414,23 +416,23 @@ xfs_iomap_prealloc_size(
/*
* If the size of the extents is greater than half the maximum extent
* length, then use the current offset as the basis. This ensures that
- * for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width
- * alignment of real extents.
+ * for large files the preallocation size always extends to
+ * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like stripe
+ * unit/width alignment of real extents.
*/
alloc_blocks = plen * 2;
- if (alloc_blocks > MAXEXTLEN)
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
alloc_blocks = XFS_B_TO_FSB(mp, offset);
qblocks = alloc_blocks;
/*
- * MAXEXTLEN is not a power of two value but we round the prealloc down
- * to the nearest power of two value after throttling. To prevent the
- * round down from unconditionally reducing the maximum supported
- * prealloc size, we round up first, apply appropriate throttling,
- * round down and cap the value to MAXEXTLEN.
+ * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the prealloc
+ * down to the nearest power of two value after throttling. To prevent
+ * the round down from unconditionally reducing the maximum supported
+ * prealloc size, we round up first, apply appropriate throttling, round
+ * down and cap the value to XFS_MAX_BMBT_EXTLEN.
*/
- alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
freesp = percpu_counter_read_positive(&mp->m_fdblocks);
@@ -478,14 +480,14 @@ xfs_iomap_prealloc_size(
*/
if (alloc_blocks)
alloc_blocks = rounddown_pow_of_two(alloc_blocks);
- if (alloc_blocks > MAXEXTLEN)
- alloc_blocks = MAXEXTLEN;
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+ alloc_blocks = XFS_MAX_BMBT_EXTLEN;
/*
* If we are still trying to allocate more space than is
* available, squash the prealloc hard. This can happen if we
* have a large file on a small filesystem and the above
- * lowspace thresholds are smaller than MAXEXTLEN.
+ * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN.
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
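The round-up/throttle/round-down sequence the comments describe can be checked with plain arithmetic: XFS_MAX_BMBT_EXTLEN is 2^21 - 1 blocks (21 bits, per the comment in xfs_bmap_util.c above), so rounding up first preserves the full prealloc size even though a rounddown and cap follow. A standalone sketch:

#include <stdio.h>

/* Round to powers of two as the prealloc sizing comments describe. */
static unsigned long long rounddown_pow2(unsigned long long x)
{
	while (x & (x - 1))
		x &= x - 1;	/* clear lowest set bit until one remains */
	return x;
}

static unsigned long long roundup_pow2(unsigned long long x)
{
	unsigned long long p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

int main(void)
{
	/* XFS_MAX_BMBT_EXTLEN is 2^21 - 1 blocks: not a power of two. */
	unsigned long long max_extlen = (1ULL << 21) - 1;
	unsigned long long alloc = roundup_pow2(max_extlen);	/* 2^21 */

	/* ... throttling would shift 'alloc' down here ... */

	alloc = rounddown_pow2(alloc);
	if (alloc > max_extlen)
		alloc = max_extlen;
	printf("prealloc capped at %llu blocks\n", alloc);
	return 0;
}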
@@ -555,6 +557,9 @@ xfs_iomap_write_unwritten(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b34e8e4344a8..e912b7fee714 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -13,6 +13,8 @@
#include "xfs_inode.h"
#include "xfs_acl.h"
#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
@@ -209,7 +211,6 @@ xfs_generic_create(
if (unlikely(error))
goto out_cleanup_inode;
-#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -220,7 +221,6 @@ xfs_generic_create(
if (error)
goto out_cleanup_inode;
}
-#endif
xfs_setup_iops(ip);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c08c79d9e311..f74c9fff72bb 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -64,6 +64,7 @@ xfs_bulkstat_one_int(
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
struct xfs_bulkstat *buf = bc->buf;
+ xfs_extnum_t nextents;
int error = -EINVAL;
if (xfs_internal_inum(mp, ino))
@@ -102,7 +103,13 @@ xfs_bulkstat_one_int(
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize_blks = ip->i_extsize;
- buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
+
+ nextents = xfs_ifork_nextents(&ip->i_df);
+ if (!(bc->breq->flags & XFS_IBULK_NREXT64))
+ buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
+ else
+ buf->bs_extents64 = nextents;
+
xfs_bulkstat_health(ip, buf);
buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
@@ -256,6 +263,7 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
+ unsigned int iwalk_flags = 0;
int error;
if (breq->mnt_userns != &init_user_ns) {
@@ -279,7 +287,10 @@ xfs_bulkstat(
if (error)
goto out;
- error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
+ if (breq->flags & XFS_IBULK_SAME_AG)
+ iwalk_flags |= XFS_IWALK_SAME_AG;
+
+ error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 7078d10c9b12..e2d0eba43f35 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -17,7 +17,10 @@ struct xfs_ibulk {
};
/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
+#define XFS_IBULK_SAME_AG (1U << 0)
+
+/* Fill out the bs_extents64 field if set. */
+#define XFS_IBULK_NREXT64 (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 37a795f03267..83699089755e 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
unsigned int inode_records, bool poll, void *data);
/* Only iterate inodes within the same AG as @startino. */
-#define XFS_IWALK_SAME_AG (0x1)
+#define XFS_IWALK_SAME_AG (1U << 0)
#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 499e15b24215..9dc748abdf33 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -49,7 +49,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclog,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp);
STATIC void
xlog_grant_push_ail(
@@ -61,10 +60,6 @@ xlog_sync(
struct xlog_in_core *iclog);
#if defined(DEBUG)
STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr);
-STATIC void
xlog_verify_grant_tail(
struct xlog *log);
STATIC void
@@ -77,7 +72,6 @@ xlog_verify_tail_lsn(
struct xlog *log,
struct xlog_in_core *iclog);
#else
-#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b)
@@ -90,6 +84,62 @@ xlog_iclogs_empty(
static int
xfs_log_cover(struct xfs_mount *);
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. This prepends the data region we return to the caller to copy their data
+ * into, so do all the static initialisation of the ophdr now. Because the ophdr
+ * is not 8 byte aligned, we have to be careful to ensure that we align the
+ * start of the buffer such that the region we return to the caller is 8 byte
+ * aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+ struct xlog_op_header *oph;
+ uint32_t len;
+ void *buf;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+ lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ sizeof(struct xlog_op_header);
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ oph = vec->i_addr;
+ oph->oh_clientid = XFS_TRANSACTION;
+ oph->oh_res2 = 0;
+ oph->oh_flags = 0;
+
+ buf = vec->i_addr + sizeof(struct xlog_op_header);
+ ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return buf;
+}
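
A minimal sketch of how a log item ->iop_format routine is expected to pair this helper with xlog_finish_iovec(); the item type, FOO_ITEM() and the format structure are hypothetical, only the helpers and XLOG_REG_TYPE_IFORMAT come from the XFS code:

static void
xfs_foo_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_foo_log_item	*fip = FOO_ITEM(lip);	/* hypothetical */
	struct xfs_log_iovec	*vecp = NULL;
	void			*buf;

	/* buf is 8 byte aligned, just past the embedded opheader */
	buf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
	memcpy(buf, &fip->foo_format, sizeof(fip->foo_format));

	/* records the rounded-up payload length in the opheader */
	xlog_finish_iovec(lv, vecp, sizeof(fip->foo_format));
}
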
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -322,30 +372,6 @@ xlog_grant_head_check(
return error;
}
-static void
-xlog_tic_reset_res(xlog_ticket_t *tic)
-{
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- tic->t_res_num_ophdrs = 0;
-}
-
-static void
-xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
-{
- if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
- /* add to overflow and start again */
- tic->t_res_o_flow += tic->t_res_arr_sum;
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- }
-
- tic->t_res_arr[tic->t_res_num].r_len = len;
- tic->t_res_arr[tic->t_res_num].r_type = type;
- tic->t_res_arr_sum += len;
- tic->t_res_num++;
-}
-
bool
xfs_log_writable(
struct xfs_mount *mp)
@@ -395,8 +421,6 @@ xfs_log_regrant(
xlog_grant_push_ail(log, tic->t_unit_res);
tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
if (tic->t_cnt > 0)
return 0;
@@ -434,10 +458,9 @@ out_error:
int
xfs_log_reserve(
struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
+ int unit_bytes,
+ int cnt,
struct xlog_ticket **ticp,
- uint8_t client,
bool permanent)
{
struct xlog *log = mp->m_log;
@@ -445,15 +468,13 @@ xfs_log_reserve(
int need_bytes;
int error = 0;
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -901,12 +922,22 @@ xlog_write_unmount_record(
struct xlog *log,
struct xlog_ticket *ticket)
{
- struct xfs_unmount_log_format ulf = {
- .magic = XLOG_UNMOUNT_TYPE,
+ struct {
+ struct xlog_op_header ophdr;
+ struct xfs_unmount_log_format ulf;
+ } unmount_rec = {
+ .ophdr = {
+ .oh_clientid = XFS_LOG,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_UNMOUNT_TRANS,
+ },
+ .ulf = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ },
};
struct xfs_log_iovec reg = {
- .i_addr = &ulf,
- .i_len = sizeof(ulf),
+ .i_addr = &unmount_rec,
+ .i_len = sizeof(unmount_rec),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
@@ -914,10 +945,14 @@ xlog_write_unmount_record(
.lv_iovecp = &reg,
};
+ BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_unmount_log_format)) !=
+ sizeof(unmount_rec));
+
/* account for space used by record data */
- ticket->t_curr_res -= sizeof(ulf);
+ ticket->t_curr_res -= sizeof(unmount_rec);
- return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS);
+ return xlog_write(log, NULL, &vec, ticket, reg.i_len);
}
/*
@@ -933,7 +968,7 @@ xlog_unmount_write(
struct xlog_ticket *tic = NULL;
int error;
- error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+ error = xfs_log_reserve(mp, 600, 1, &tic, 0);
if (error)
goto out_err;
@@ -1584,9 +1619,6 @@ xlog_alloc_log(
GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog->ic_data)
goto out_free_iclog;
-#ifdef DEBUG
- log->l_iclog_bak[i] = &iclog->ic_header;
-#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1602,7 +1634,7 @@ xlog_alloc_log(
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
INIT_LIST_HEAD(&iclog->ic_callbacks);
- iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+ iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -2111,63 +2143,11 @@ xlog_print_tic_res(
struct xfs_mount *mp,
struct xlog_ticket *ticket)
{
- uint i;
- uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
-
- /* match with XLOG_REG_TYPE_* in xfs_log.h */
-#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[] = {
- REG_TYPE_STR(BFORMAT, "bformat"),
- REG_TYPE_STR(BCHUNK, "bchunk"),
- REG_TYPE_STR(EFI_FORMAT, "efi_format"),
- REG_TYPE_STR(EFD_FORMAT, "efd_format"),
- REG_TYPE_STR(IFORMAT, "iformat"),
- REG_TYPE_STR(ICORE, "icore"),
- REG_TYPE_STR(IEXT, "iext"),
- REG_TYPE_STR(IBROOT, "ibroot"),
- REG_TYPE_STR(ILOCAL, "ilocal"),
- REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
- REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
- REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
- REG_TYPE_STR(QFORMAT, "qformat"),
- REG_TYPE_STR(DQUOT, "dquot"),
- REG_TYPE_STR(QUOTAOFF, "quotaoff"),
- REG_TYPE_STR(LRHEADER, "LR header"),
- REG_TYPE_STR(UNMOUNT, "unmount"),
- REG_TYPE_STR(COMMIT, "commit"),
- REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create"),
- REG_TYPE_STR(RUI_FORMAT, "rui_format"),
- REG_TYPE_STR(RUD_FORMAT, "rud_format"),
- REG_TYPE_STR(CUI_FORMAT, "cui_format"),
- REG_TYPE_STR(CUD_FORMAT, "cud_format"),
- REG_TYPE_STR(BUI_FORMAT, "bui_format"),
- REG_TYPE_STR(BUD_FORMAT, "bud_format"),
- };
- BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
-#undef REG_TYPE_STR
-
xfs_warn(mp, "ticket reservation summary:");
- xfs_warn(mp, " unit res = %d bytes",
- ticket->t_unit_res);
- xfs_warn(mp, " current res = %d bytes",
- ticket->t_curr_res);
- xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
- ticket->t_res_arr_sum, ticket->t_res_o_flow);
- xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
- ticket->t_res_num_ophdrs, ophdr_spc);
- xfs_warn(mp, " ophdr + reg = %u bytes",
- ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
- xfs_warn(mp, " num regions = %u",
- ticket->t_res_num);
-
- for (i = 0; i < ticket->t_res_num; i++) {
- uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes", i,
- ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type]),
- ticket->t_res_arr[i].r_len);
- }
+ xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
+ xfs_warn(mp, " original count = %d", ticket->t_ocnt);
+ xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
}
/*
@@ -2220,187 +2200,226 @@ xlog_print_trans(
}
}
+static inline void
+xlog_write_iovec(
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ void *data,
+ uint32_t write_len,
+ int *bytes_left,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
+ ASSERT(*log_offset % sizeof(int32_t) == 0);
+ ASSERT(write_len % sizeof(int32_t) == 0);
+
+ memcpy(iclog->ic_datap + *log_offset, data, write_len);
+ *log_offset += write_len;
+ *bytes_left -= write_len;
+ (*record_cnt)++;
+ *data_cnt += write_len;
+}
+
/*
- * Calculate the potential space needed by the log vector. We may need a start
- * record, and each region gets its own struct xlog_op_header and may need to be
- * double word aligned.
+ * Write log vectors into a single iclog which is guaranteed by the caller
+ * to have enough space to write the entire log vector into.
*/
-static int
-xlog_write_calc_vec_length(
+static void
+xlog_write_full(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector,
- uint optype)
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- struct xfs_log_vec *lv;
- int headers = 0;
- int len = 0;
- int i;
-
- if (optype & XLOG_START_TRANS)
- headers++;
-
- for (lv = log_vector; lv; lv = lv->lv_next) {
- /* we don't write ordered log vectors */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
- continue;
+ int index;
- headers += lv->lv_niovecs;
+ ASSERT(*log_offset + *len <= iclog->ic_size ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
- for (i = 0; i < lv->lv_niovecs; i++) {
- struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+ /*
+ * Ordered log vectors have no regions to write so this
+ * loop will naturally skip them.
+ */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ struct xlog_op_header *ophdr = reg->i_addr;
- len += vecp->i_len;
- xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
- }
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ reg->i_len, len, record_cnt, data_cnt);
}
-
- ticket->t_res_num_ophdrs += headers;
- len += headers * sizeof(struct xlog_op_header);
-
- return len;
-}
-
-static void
-xlog_write_start_rec(
- struct xlog_op_header *ophdr,
- struct xlog_ticket *ticket)
-{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_len = 0;
- ophdr->oh_flags = XLOG_START_TRANS;
- ophdr->oh_res2 = 0;
}
-static xlog_op_header_t *
-xlog_write_setup_ophdr(
- struct xlog *log,
- struct xlog_op_header *ophdr,
+static int
+xlog_write_get_more_iclog_space(
struct xlog_ticket *ticket,
- uint flags)
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_res2 = 0;
-
- /* are we copying a commit or unmount record? */
- ophdr->oh_flags = flags;
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog *log = iclog->ic_log;
+ int error;
- /*
- * We've seen logs corrupted with bad transaction client ids. This
- * makes sure that XFS doesn't generate them on. Turn this into an EIO
- * and shut down the filesystem.
- */
- switch (ophdr->oh_clientid) {
- case XFS_TRANSACTION:
- case XFS_VOLUME:
- case XFS_LOG:
- break;
- default:
- xfs_warn(log->l_mp,
- "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
- ophdr->oh_clientid, ticket);
- return NULL;
- }
+ spin_lock(&log->l_icloglock);
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
- return ophdr;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ log_offset);
+ if (error)
+ return error;
+ *record_cnt = 0;
+ *data_cnt = 0;
+ *iclogp = iclog;
+ return 0;
}
/*
- * Set up the parameters of the region copy into the log. This has
- * to handle region write split across multiple log buffers - this
- * state is kept external to this function so that this code can
- * be written in an obvious, self documenting manner.
+ * Write log vectors into a single iclog which is smaller than the current chain
+ * length. We write until we cannot fit a full record into the remaining space
+ * and then stop, handing the current iclog back to the caller to continue
+ * writing the remainder of the chain.
*/
static int
-xlog_write_setup_copy(
+xlog_write_partial(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xlog_op_header *ophdr,
- int space_available,
- int space_required,
- int *copy_off,
- int *copy_len,
- int *last_was_partial_copy,
- int *bytes_consumed)
-{
- int still_to_copy;
-
- still_to_copy = space_required - *bytes_consumed;
- *copy_off = *bytes_consumed;
-
- if (still_to_copy <= space_available) {
- /* write of region completes here */
- *copy_len = still_to_copy;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- if (*last_was_partial_copy)
- ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
- *last_was_partial_copy = 0;
- *bytes_consumed = 0;
- return 0;
- }
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog_op_header *ophdr;
+ int index = 0;
+ uint32_t rlen;
+ int error;
- /* partial write of region, needs extra log op header reservation */
- *copy_len = space_available;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (*last_was_partial_copy)
- ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
- *bytes_consumed += *copy_len;
- (*last_was_partial_copy)++;
+ /* walk the logvec, copying until we run out of space in the iclog */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ uint32_t reg_offset = 0;
- /* account for new log op header */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
- ticket->t_res_num_ophdrs++;
+ /*
+ * The first region of a continuation must have a non-zero
+ * length otherwise log recovery will just skip over it and
+ * start recovering from the next opheader it finds. Because we
+ * mark the next opheader as a continuation, recovery will then
+ * incorrectly add the continuation to the previous region and
+ * that breaks stuff.
+ *
+ * Hence if there isn't space for region data after the
+ * opheader, then we need to start afresh with a new iclog.
+ */
+ if (iclog->ic_size - *log_offset <=
+ sizeof(struct xlog_op_header)) {
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return error;
+ }
- return sizeof(struct xlog_op_header);
-}
+ ophdr = reg->i_addr;
+ rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
-static int
-xlog_write_copy_finish(
- struct xlog *log,
- struct xlog_in_core *iclog,
- uint flags,
- int *record_cnt,
- int *data_cnt,
- int *partial_copy,
- int *partial_copy_len,
- int log_offset)
-{
- int error;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+ if (rlen != reg->i_len)
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ rlen, len, record_cnt, data_cnt);
+
+ /* If we wrote the whole region, move to the next. */
+ if (rlen == reg->i_len)
+ continue;
- if (*partial_copy) {
/*
- * This iclog has already been marked WANT_SYNC by
- * xlog_state_get_iclog_space.
+ * We now have a partially written iovec, but it can span
+ * multiple iclogs so we loop here. First we release the iclog
+ * we currently have, then we get a new iclog and add a new
+ * opheader. Then we continue copying from where we were until
+ * we either complete the iovec or fill the iclog. If we
+ * complete the iovec, then we increment the index and go right
+ * back to the top of the outer loop. If we fill the iclog, we
+ * run the inner loop again.
+ *
+ * This is complicated by the tail of a region using all the
+ * space in an iclog and hence requiring us to release the iclog
+ * and get a new one before returning to the outer loop. We must
+ * always guarantee that we exit this inner loop with at least
+ * space for log transaction opheaders left in the current
+ * iclog, hence we cannot just terminate the loop at the end
+ * of the continuation. So we loop while there is no
+ * space left in the current iclog, and check for the end of the
+ * continuation after getting a new iclog.
*/
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
- goto release_iclog;
- }
+ do {
+ /*
+ * Ensure we include the continuation opheader in the
+ * space we need in the new iclog by adding that size
+ * to the length we require. This continuation opheader
+ * needs to be accounted to the ticket as the space it
+ * consumes hasn't been accounted to the lv we are
+ * writing.
+ */
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset,
+ *len + sizeof(struct xlog_op_header),
+ record_cnt, data_cnt);
+ if (error)
+ return error;
- *partial_copy = 0;
- *partial_copy_len = 0;
+ ophdr = iclog->ic_datap + *log_offset;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = XFS_TRANSACTION;
+ ophdr->oh_res2 = 0;
+ ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
- if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t))
- return 0;
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ *log_offset += sizeof(struct xlog_op_header);
+ *data_cnt += sizeof(struct xlog_op_header);
- /* no more space in this iclog - push it. */
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
+ /*
+ * If rlen fits in the iclog, then end the region
+ * continuation. Otherwise we're going around again.
+ */
+ reg_offset += rlen;
+ rlen = reg->i_len - reg_offset;
+ if (rlen <= iclog->ic_size - *log_offset)
+ ophdr->oh_flags |= XLOG_END_TRANS;
+ else
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- xlog_is_shutdown(log));
-release_iclog:
- error = xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- return error;
+ rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+ ophdr->oh_len = cpu_to_be32(rlen);
+
+ xlog_write_iovec(iclog, log_offset,
+ reg->i_addr + reg_offset,
+ rlen, len, record_cnt, data_cnt);
+
+ } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
+ }
+
+ /*
+ * No more iovecs remain in this logvec, so hand the current iclog back to
+ * the caller so it can go back to fast path copying.
+ */
+ *iclogp = iclog;
+ return 0;
}
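
To make the continuation arithmetic concrete, an illustrative walk-through with made-up sizes (none of these numbers appear in the patch):

/*
 * Suppose iclog->ic_size = 32768 and *log_offset = 28672 when we meet a
 * region with reg->i_len = 12288. The outer loop copies
 * rlen = min(12288, 32768 - 28672) = 4096 bytes and marks the opheader
 * XLOG_CONTINUE_TRANS. The inner loop then releases this iclog, grabs a
 * fresh one, writes a continuation opheader, and the remaining
 * 12288 - 4096 = 8192 bytes now fit, so that opheader gets XLOG_END_TRANS
 * and the loop exits.
 */
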
/*
@@ -2449,27 +2468,16 @@ xlog_write(
struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket,
- uint optype)
+ uint32_t len)
{
struct xlog_in_core *iclog = NULL;
struct xfs_log_vec *lv = log_vector;
- struct xfs_log_iovec *vecp = lv->lv_iovecp;
- int index = 0;
- int len;
- int partial_copy = 0;
- int partial_copy_len = 0;
- int contwr = 0;
- int record_cnt = 0;
- int data_cnt = 0;
+ uint32_t record_cnt = 0;
+ uint32_t data_cnt = 0;
int error = 0;
+ int log_offset;
- /*
- * If this is a commit or unmount transaction, we don't need a start
- * record to be written. We do, however, have to account for the
- * commit or unmount header that gets written. Hence we always have
- * to account for an extra xlog_op_header here.
- */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2477,144 +2485,54 @@ xlog_write(
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
- len = xlog_write_calc_vec_length(ticket, log_vector, optype);
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- void *ptr;
- int log_offset;
-
- error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
- &contwr, &log_offset);
- if (error)
- return error;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &log_offset);
+ if (error)
+ return error;
- ASSERT(log_offset <= iclog->ic_size - 1);
- ptr = iclog->ic_datap + log_offset;
+ ASSERT(log_offset <= iclog->ic_size - 1);
- /*
- * If we have a context pointer, pass it the first iclog we are
- * writing to so it can record state needed for iclog write
- * ordering.
- */
- if (ctx) {
- xlog_cil_set_ctx_write_state(ctx, iclog);
- ctx = NULL;
- }
+ /*
+ * If we have a context pointer, pass it the first iclog we are
+ * writing to so it can record state needed for iclog write
+ * ordering.
+ */
+ if (ctx)
+ xlog_cil_set_ctx_write_state(ctx, iclog);
+ while (lv) {
/*
- * This loop writes out as many regions as can fit in the amount
- * of space which was allocated by xlog_state_get_iclog_space().
+ * If the entire log vec does not fit in the iclog, punt it to
+ * the partial copy loop which can handle this case.
*/
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- struct xfs_log_iovec *reg;
- struct xlog_op_header *ophdr;
- int copy_len;
- int copy_off;
- bool ordered = false;
- bool wrote_start_rec = false;
-
- /* ordered log vectors have no regions to write */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
- ASSERT(lv->lv_niovecs == 0);
- ordered = true;
- goto next_lv;
- }
-
- reg = &vecp[index];
- ASSERT(reg->i_len % sizeof(int32_t) == 0);
- ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
-
- /*
- * Before we start formatting log vectors, we need to
- * write a start record. Only do this for the first
- * iclog we write to.
- */
- if (optype & XLOG_START_TRANS) {
- xlog_write_start_rec(ptr, ticket);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
- optype &= ~XLOG_START_TRANS;
- wrote_start_rec = true;
- }
-
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype);
- if (!ophdr)
- return -EIO;
-
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
-
- len += xlog_write_setup_copy(ticket, ophdr,
- iclog->ic_size-log_offset,
- reg->i_len,
- &copy_off, &copy_len,
- &partial_copy,
- &partial_copy_len);
- xlog_verify_dest_ptr(log, ptr);
-
- /*
- * Copy region.
- *
- * Unmount records just log an opheader, so can have
- * empty payloads with no data region to copy. Hence we
- * only copy the payload if the vector says it has data
- * to copy.
- */
- ASSERT(copy_len >= 0);
- if (copy_len > 0) {
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- copy_len);
- }
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- if (wrote_start_rec) {
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- }
- data_cnt += contwr ? copy_len : 0;
-
- error = xlog_write_copy_finish(log, iclog, optype,
- &record_cnt, &data_cnt,
- &partial_copy,
- &partial_copy_len,
- log_offset);
- if (error)
+ if (lv->lv_niovecs &&
+ lv->lv_bytes > iclog->ic_size - log_offset) {
+ error = xlog_write_partial(lv, ticket, &iclog,
+ &log_offset, &len, &record_cnt,
+ &data_cnt);
+ if (error) {
+ /*
+ * We have no iclog to release, so just return
+ * the error immediately.
+ */
return error;
-
- /*
- * if we had a partial copy, we need to get more iclog
- * space but we don't want to increment the region
- * index because there is still more is this region to
- * write.
- *
- * If we completed writing this region, and we flushed
- * the iclog (indicated by resetting of the record
- * count), then we also need to get more log space. If
- * this was the last record, though, we are done and
- * can just return.
- */
- if (partial_copy)
- break;
-
- if (++index == lv->lv_niovecs) {
-next_lv:
- lv = lv->lv_next;
- index = 0;
- if (lv)
- vecp = lv->lv_iovecp;
- }
- if (record_cnt == 0 && !ordered) {
- if (!lv)
- return 0;
- break;
}
+ } else {
+ xlog_write_full(lv, ticket, iclog, &log_offset,
+ &len, &record_cnt, &data_cnt);
}
+ lv = lv->lv_next;
}
-
ASSERT(len == 0);
+ /*
+ * We've already been guaranteed that the last writes will fit inside
+ * the current iclog, and hence it will already have the space used by
+ * those writes accounted to it. Hence we do not need to update the
+ * iclog with the number of bytes written here.
+ */
spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+ xlog_state_finish_copy(log, iclog, record_cnt, 0);
error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
@@ -2971,7 +2889,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclogp,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp)
{
int log_offset;
@@ -3008,9 +2925,6 @@ restart:
*/
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
- xlog_tic_add_region(ticket,
- log->l_iclog_hsize,
- XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
@@ -3052,13 +2966,10 @@ restart:
* iclogs (to mark it taken), this particular iclog will release/sync
* to disk in xlog_write().
*/
- if (len <= iclog->ic_size - iclog->ic_offset) {
- *continued_write = 0;
+ if (len <= iclog->ic_size - iclog->ic_offset)
iclog->ic_offset += len;
- } else {
- *continued_write = 1;
+ else
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- }
*iclogp = iclog;
ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3090,7 +3001,6 @@ xfs_log_ticket_regrant(
xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
trace_xfs_log_ticket_regrant_sub(log, ticket);
@@ -3101,7 +3011,6 @@ xfs_log_ticket_regrant(
trace_xfs_log_ticket_regrant_exit(log, ticket);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
}
xfs_log_ticket_put(ticket);
@@ -3591,7 +3500,6 @@ xlog_ticket_alloc(
struct xlog *log,
int unit_bytes,
int cnt,
- char client,
bool permanent)
{
struct xlog_ticket *tic;
@@ -3609,40 +3517,14 @@ xlog_ticket_alloc(
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
- tic->t_clientid = client;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- xlog_tic_reset_res(tic);
-
return tic;
}
#if defined(DEBUG)
/*
- * Make sure that the destination ptr is within the valid data region of
- * one of the iclogs. This uses backup pointers stored in a different
- * part of the log in case we trash the log structure.
- */
-STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr)
-{
- int i;
- int good_ptr = 0;
-
- for (i = 0; i < log->l_iclog_bufs; i++) {
- if (ptr >= log->l_iclog_bak[i] &&
- ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
- good_ptr++;
- }
-
- if (!good_ptr)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
-}
-
-/*
* Check to make sure the grant write head didn't just over lap the tail. If
* the cycles are the same, we can't be overlapping. Otherwise, make sure that
* the cycles differ by exactly one and check the byte count.
@@ -3769,7 +3651,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3780,11 +3662,12 @@ xlog_verify_iclog(
iclog->ic_header.h_cycle_data[idx]);
}
}
- if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
xfs_warn(log->l_mp,
- "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
- __func__, clientid, ophead,
+ "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+ __func__, i, clientid, ophead,
(unsigned long)field_offset);
+ }
/* check length */
p = &ophead->oh_len;
@@ -3792,8 +3675,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((uintptr_t)&ophead->oh_len -
- (uintptr_t)iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,7 +3711,7 @@ xlog_verify_iclog(
bool
xlog_force_shutdown(
struct xlog *log,
- int shutdown_flags)
+ uint32_t shutdown_flags)
{
bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
@@ -3995,3 +3877,44 @@ xlog_drop_incompat_feat(
{
up_read(&log->l_incompat_users);
}
+
+/*
+ * Get permission to use logged extended attribute updates.
+ *
+ * Callers must not be running any transactions or hold any inode locks, and
+ * they must release the permission by calling xlog_drop_incompat_feat
+ * when they're done.
+ */
+int
+xfs_attr_use_log_assist(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /*
+ * Protect ourselves from an idle log clearing the logged xattrs log
+ * incompat feature bit.
+ */
+ xlog_use_incompat_feat(mp->m_log);
+
+ /*
+ * If log-assisted xattrs are already enabled, the caller can use the
+ * log-assisted xattr functions with the log-incompat reference we got.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return 0;
+
+ /* Enable log-assisted xattrs. */
+ error = xfs_add_incompat_log_feature(mp,
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+ if (error)
+ goto drop_incompat;
+
+ xfs_warn_once(mp,
+"EXPERIMENTAL logged extended attributes feature added. Use at your own risk!");
+
+ return 0;
+drop_incompat:
+ xlog_drop_incompat_feat(mp->m_log);
+ return error;
+}
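
The contract in the comment above implies a take/do/drop pattern in callers. A hedged sketch, where only xfs_attr_use_log_assist() and xlog_drop_incompat_feat() come from this patch and the rest is hypothetical:

int
xfs_foo_logged_xattr_op(
	struct xfs_mount	*mp)
{
	int			error;

	error = xfs_attr_use_log_assist(mp);	/* take log-incompat ref */
	if (error)
		return error;

	error = xfs_foo_do_op(mp);		/* hypothetical logged op */

	xlog_drop_incompat_feat(mp->m_log);	/* release the reference */
	return error;
}
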
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index dc1b77b92fc1..252b098cde1f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -21,46 +21,59 @@ struct xfs_log_vec {
#define XFS_LOG_VEC_ORDERED (-1)
-static inline void *
-xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- uint type)
+/*
+ * Calculate the log iovec length for a given user buffer length. Intended to be
+ * used by ->iop_size implementations when sizing buffers of arbitrary
+ * alignments.
+ */
+static inline int
+xlog_calc_iovec_len(int len)
{
- struct xfs_log_iovec *vec = *vecp;
+ return roundup(len, sizeof(uint32_t));
+}
- if (vec) {
- ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
- vec++;
- } else {
- vec = &lv->lv_iovecp[0];
+void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type);
+
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
+ int data_len)
+{
+ struct xlog_op_header *oph = vec->i_addr;
+ int len;
+
+ /*
+ * Always round up the length to the correct alignment so callers don't
+ * need to know anything about this log vec layout requirement. This
+ * means we have to zero the area that the data does not cover. This is
+ * complicated by the fact that the payload region is offset into the
+ * logvec region by the opheader that tracks the payload.
+ */
+ len = xlog_calc_iovec_len(data_len);
+ if (len - data_len != 0) {
+ char *buf = vec->i_addr + sizeof(struct xlog_op_header);
+
+ memset(buf + data_len, 0, len - data_len);
}
- vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ /*
+ * The opheader tracks aligned payload length, whilst the logvec tracks
+ * the overall region length.
+ */
+ oph->oh_len = cpu_to_be32(len);
- ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+ len += sizeof(struct xlog_op_header);
+ lv->lv_buf_len += len;
+ lv->lv_bytes += len;
+ vec->i_len = len;
- *vecp = vec;
- return vec->i_addr;
+ /* Catch buffer overruns */
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
}
/*
- * We need to make sure the next buffer is naturally aligned for the biggest
- * basic data type we put into it. We already accounted for this padding when
- * sizing the buffer.
- *
- * However, this padding does not get written into the log, and hence we have to
- * track the space used by the log vectors separately to prevent log space hangs
- * due to inaccurate accounting (i.e. a leak) of the used log space through the
- * CIL context ticket.
+ * Copy the amount of data requested by the caller into a new log iovec.
*/
-static inline void
-xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
-{
- lv->lv_buf_len += round_up(len, sizeof(uint64_t));
- lv->lv_bytes += len;
- vec->i_len = len;
-}
-
static inline void *
xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
uint type, void *data, int len)
@@ -117,15 +130,11 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
-void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_reserve(struct xfs_mount *mp,
- int length,
- int count,
- struct xlog_ticket **ticket,
- uint8_t clientid,
- bool permanent);
-int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-void xfs_log_unmount(struct xfs_mount *mp);
+void xfs_log_space_wake(struct xfs_mount *mp);
+int xfs_log_reserve(struct xfs_mount *mp, int length, int count,
+ struct xlog_ticket **ticket, bool permanent);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+void xfs_log_unmount(struct xfs_mount *mp);
bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
@@ -140,9 +149,10 @@ void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
-bool xlog_force_shutdown(struct xlog *log, int shutdown_flags);
+bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
void xlog_use_incompat_feat(struct xlog *log);
void xlog_drop_incompat_feat(struct xlog *log);
+int xfs_attr_use_log_assist(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */
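
As a quick check of the padding rules in xlog_calc_iovec_len() and xlog_finish_iovec() above, a sketch with an arbitrary 13-byte payload:

	int len = xlog_calc_iovec_len(13);	/* roundup(13, 4) == 16 */

	/*
	 * xlog_finish_iovec() would then zero payload bytes 13..15, store
	 * 16 in oph->oh_len, and add 16 + sizeof(struct xlog_op_header) to
	 * both lv->lv_buf_len and lv->lv_bytes.
	 */
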
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ba57323bfdce..db6cb7800251 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,7 +37,7 @@ xlog_cil_ticket_alloc(
{
struct xlog_ticket *tic;
- tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
+ tic = xlog_ticket_alloc(log, 0, 1, 0);
/*
* set the current reservation to zero so we know to steal the basic
@@ -48,6 +48,38 @@ xlog_cil_ticket_alloc(
}
/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+static bool
+xlog_item_in_current_chkpt(
+ struct xfs_cil *cil,
+ struct xfs_log_item *lip)
+{
+ if (list_empty(&lip->li_cil))
+ return false;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+}
+
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
+}
+
+/*
* Unavoidable forward declaration - xlog_cil_push_work() calls
* xlog_cil_ctx_alloc() itself.
*/
@@ -103,39 +135,6 @@ xlog_cil_iovec_space(
}
/*
- * shadow buffers can be large, so we need to use kvmalloc() here to ensure
- * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
- * back to vmalloc, so we can't actually do anything useful with gfp flags to
- * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
- * direct reclaim and compaction in the slow path, both of which are
- * horrendously expensive. We just want kmalloc to fail fast and fall back to
- * vmalloc if it can't get somethign straight away from the free lists or buddy
- * allocator. Hence we have to open code kvmalloc outselves here.
- *
- * Also, we are in memalloc_nofs_save task context here, so despite the use of
- * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
- * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
- * just all pretend this is a GFP_KERNEL context operation....
- */
-static inline void *
-xlog_cil_kvmalloc(
- size_t buf_size)
-{
- gfp_t flags = GFP_KERNEL;
- void *p;
-
- flags &= ~__GFP_DIRECT_RECLAIM;
- flags |= __GFP_NOWARN | __GFP_NORETRY;
- do {
- p = kmalloc(buf_size, flags);
- if (!p)
- p = vmalloc(buf_size);
- } while (!p);
-
- return p;
-}
-
-/*
* Allocate or pin log vector buffers for CIL insertion.
*
* The CIL currently uses disposable buffers for copying a snapshot of the
@@ -214,13 +213,20 @@ xlog_cil_alloc_shadow_bufs(
}
/*
- * We 64-bit align the length of each iovec so that the start
- * of the next one is naturally aligned. We'll need to
- * account for that slack space here. Then round nbytes up
- * to 64-bit alignment so that the initial buffer alignment is
- * easy to calculate and verify.
+ * We 64-bit align the length of each iovec so that the start of
+ * the next one is naturally aligned. We'll need to account for
+ * that slack space here.
+ *
+ * We also add the xlog_op_header to each region when
+ * formatting, but that's not accounted to the size of the item
+ * at this point. Hence we'll need an additional number of bytes
+ * for each vector to hold an opheader.
+ *
+ * Then round nbytes up to 64-bit alignment so that the initial
+ * buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs * sizeof(uint64_t);
+ nbytes += niovecs *
+ (sizeof(uint64_t) + sizeof(struct xlog_op_header));
nbytes = round_up(nbytes, sizeof(uint64_t));
/*
@@ -244,7 +250,7 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kmem_free(lip->li_lv_shadow);
- lv = xlog_cil_kvmalloc(buf_size);
+ lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
@@ -277,22 +283,18 @@ xlog_cil_alloc_shadow_bufs(
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
+ * log space it will consume, and if it is a new item pin it as well.
*/
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
struct xfs_log_vec *lv,
struct xfs_log_vec *old_lv,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
- *diff_iovecs += lv->lv_niovecs;
- }
/*
* If there is no old LV, this is the first time we've seen the item in
@@ -309,7 +311,6 @@ xfs_cil_prepare_item(
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
*diff_len -= old_lv->lv_bytes;
- *diff_iovecs -= old_lv->lv_niovecs;
lv->lv_item->li_lv_shadow = old_lv;
}
@@ -358,12 +359,10 @@ static void
xlog_cil_insert_format_items(
struct xlog *log,
struct xfs_trans *tp,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
struct xfs_log_item *lip;
-
/* Bail out if we didn't find a log item. */
if (list_empty(&tp->t_items)) {
ASSERT(0);
@@ -406,7 +405,6 @@ xlog_cil_insert_format_items(
* set the item up as though it is a new insertion so
* that the space reservation accounting is correct.
*/
- *diff_iovecs -= lv->lv_niovecs;
*diff_len -= lv->lv_bytes;
/* Ensure the lv is set up according to ->iop_size */
@@ -431,7 +429,7 @@ xlog_cil_insert_format_items(
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+ xfs_cil_prepare_item(log, lv, old_lv, diff_len);
}
}
@@ -445,13 +443,13 @@ insert:
static void
xlog_cil_insert_items(
struct xlog *log,
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ uint32_t released_space)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
struct xfs_log_item *lip;
int len = 0;
- int diff_iovecs = 0;
int iclog_space;
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
@@ -461,15 +459,10 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+ xlog_cil_insert_format_items(log, tp, &len);
spin_lock(&cil->xc_cil_lock);
- /* account for space used by new iovec headers */
- iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
- len += iovhdr_res;
- ctx->nvecs += diff_iovecs;
-
/* attach the transaction to the CIL if it has any busy extents */
if (!list_empty(&tp->t_busy))
list_splice_init(&tp->t_busy, &ctx->busy_extents);
@@ -500,7 +493,9 @@ xlog_cil_insert_items(
ASSERT(tp->t_ticket->t_curr_res >= len);
}
tp->t_ticket->t_curr_res -= len;
+ tp->t_ticket->t_curr_res += released_space;
ctx->space_used += len;
+ ctx->space_used -= released_space;
/*
* If we've overrun the reservation, dump the tx details before we move
@@ -605,7 +600,7 @@ xlog_discard_busy_extents(
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, 0, &bio);
+ GFP_NOFS, &bio);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
@@ -822,7 +817,8 @@ restart:
static int
xlog_cil_write_chain(
struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *chain)
+ struct xfs_log_vec *chain,
+ uint32_t chain_len)
{
struct xlog *log = ctx->cil->xc_log;
int error;
@@ -830,7 +826,7 @@ xlog_cil_write_chain(
error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
if (error)
return error;
- return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS);
+ return xlog_write(log, ctx, chain, ctx->ticket, chain_len);
}
/*
@@ -844,9 +840,14 @@ xlog_cil_write_commit_record(
struct xfs_cil_ctx *ctx)
{
struct xlog *log = ctx->cil->xc_log;
+ struct xlog_op_header ophdr = {
+ .oh_clientid = XFS_TRANSACTION,
+ .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
+ .oh_flags = XLOG_COMMIT_TRANS,
+ };
struct xfs_log_iovec reg = {
- .i_addr = NULL,
- .i_len = 0,
+ .i_addr = &ophdr,
+ .i_len = sizeof(struct xlog_op_header),
.i_type = XLOG_REG_TYPE_COMMIT,
};
struct xfs_log_vec vec = {
@@ -862,12 +863,138 @@ xlog_cil_write_commit_record(
if (error)
return error;
- error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
+ /* account for space used by record data */
+ ctx->ticket->t_curr_res -= reg.i_len;
+ error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len);
if (error)
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
+struct xlog_cil_trans_hdr {
+ struct xlog_op_header oph[2];
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr[2];
+};
+
+/*
+ * Build a checkpoint transaction header to begin the journal transaction. We
+ * need to account for the space used by the transaction header here as it is
+ * not accounted for in xlog_write().
+ *
+ * This is the only place we write a transaction header, so we also build the
+ * log opheaders that indicate the start of a log transaction and wrap the
+ * transaction header. We keep the start record in its own log vector rather
+ * than compacting them into a single region as this ends up making the logic
+ * in xlog_write() for handling empty opheaders for start, commit and unmount
+ * records much simpler.
+ */
+static void
+xlog_cil_build_trans_hdr(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_cil_trans_hdr *hdr,
+ struct xfs_log_vec *lvhdr,
+ int num_iovecs)
+{
+ struct xlog_ticket *tic = ctx->ticket;
+ __be32 tid = cpu_to_be32(tic->t_tid);
+
+ memset(hdr, 0, sizeof(*hdr));
+
+ /* Log start record */
+ hdr->oph[0].oh_tid = tid;
+ hdr->oph[0].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[0].oh_flags = XLOG_START_TRANS;
+
+ /* log iovec region pointer */
+ hdr->lhdr[0].i_addr = &hdr->oph[0];
+ hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
+ hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
+
+ /* log opheader */
+ hdr->oph[1].oh_tid = tid;
+ hdr->oph[1].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
+
+ /* transaction header in host byte order format */
+ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
+ hdr->thdr.th_tid = tic->t_tid;
+ hdr->thdr.th_num_items = num_iovecs;
+
+ /* log iovec region pointer */
+ hdr->lhdr[1].i_addr = &hdr->oph[1];
+ hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_trans_header);
+ hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
+
+ lvhdr->lv_niovecs = 2;
+ lvhdr->lv_iovecp = &hdr->lhdr[0];
+ lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
+ lvhdr->lv_next = ctx->lv_chain;
+
+ tic->t_curr_res -= lvhdr->lv_bytes;
+}
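
Read together, the header this builds should lay out at the head of the checkpoint as follows (a reading of the code above, not text from the patch):

/*
 * lhdr[0]: xlog_op_header { oh_flags = XLOG_START_TRANS, oh_len = 0 }
 * lhdr[1]: xlog_op_header { oh_len = sizeof(struct xfs_trans_header) }
 *          xfs_trans_header { XFS_TRANS_CHECKPOINT, th_num_items }
 * ...then the regions of ctx->lv_chain follow...
 */
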
+
+/*
+ * Pull all the log vectors off the items in the CIL, and remove the items from
+ * the CIL. We don't need the CIL lock here because it's only needed on the
+ * transaction commit side which is currently locked out by the flush lock.
+ *
+ * If a log item is marked with a whiteout, we do not need to write it to the
+ * journal and so we just move it to the whiteout list for the caller to
+ * dispose of appropriately.
+ */
+static void
+xlog_cil_build_lv_chain(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx,
+ struct list_head *whiteouts,
+ uint32_t *num_iovecs,
+ uint32_t *num_bytes)
+{
+ struct xfs_log_vec *lv = NULL;
+
+ while (!list_empty(&cil->xc_cil)) {
+ struct xfs_log_item *item;
+
+ item = list_first_entry(&cil->xc_cil,
+ struct xfs_log_item, li_cil);
+
+ if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
+ list_move(&item->li_cil, whiteouts);
+ trace_xfs_cil_whiteout_skip(item);
+ continue;
+ }
+
+ list_del_init(&item->li_cil);
+ if (!ctx->lv_chain)
+ ctx->lv_chain = item->li_lv;
+ else
+ lv->lv_next = item->li_lv;
+ lv = item->li_lv;
+ item->li_lv = NULL;
+ *num_iovecs += lv->lv_niovecs;
+
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ *num_bytes += lv->lv_bytes;
+ }
+}
+
+static void
+xlog_cil_cleanup_whiteouts(
+ struct list_head *whiteouts)
+{
+ while (!list_empty(whiteouts)) {
+ struct xfs_log_item *item = list_first_entry(whiteouts,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ trace_xfs_cil_whiteout_unpin(item);
+ item->li_ops->iop_unpin(item, 1);
+ }
+}
+
/*
* Push the Committed Item List to the log.
*
@@ -890,16 +1017,15 @@ xlog_cil_push_work(
container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
struct xlog *log = cil->xc_log;
- struct xfs_log_vec *lv;
struct xfs_cil_ctx *new_ctx;
- struct xlog_ticket *tic;
- int num_iovecs;
+ int num_iovecs = 0;
+ int num_bytes = 0;
int error = 0;
- struct xfs_trans_header thdr;
- struct xfs_log_iovec lhdr;
+ struct xlog_cil_trans_hdr thdr;
struct xfs_log_vec lvhdr = { NULL };
xfs_csn_t push_seq;
bool push_commit_stable;
+ LIST_HEAD(whiteouts);
new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -968,28 +1094,7 @@ xlog_cil_push_work(
list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
- /*
- * Pull all the log vectors off the items in the CIL, and remove the
- * items from the CIL. We don't need the CIL lock here because it's only
- * needed on the transaction commit side which is currently locked out
- * by the flush lock.
- */
- lv = NULL;
- num_iovecs = 0;
- while (!list_empty(&cil->xc_cil)) {
- struct xfs_log_item *item;
-
- item = list_first_entry(&cil->xc_cil,
- struct xfs_log_item, li_cil);
- list_del_init(&item->li_cil);
- if (!ctx->lv_chain)
- ctx->lv_chain = item->li_lv;
- else
- lv->lv_next = item->li_lv;
- lv = item->li_lv;
- item->li_lv = NULL;
- num_iovecs += lv->lv_niovecs;
- }
+ xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes);
/*
* Switch the contexts so we can drop the context lock and move out
@@ -1025,26 +1130,11 @@ xlog_cil_push_work(
* Build a checkpoint transaction header and write it to the log to
* begin the transaction. We need to account for the space used by the
* transaction header here as it is not accounted for in xlog_write().
- *
- * The LSN we need to pass to the log items on transaction commit is
- * the LSN reported by the first log vector write. If we use the commit
- * record lsn then we can move the tail beyond the grant write head.
*/
- tic = ctx->ticket;
- thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
- thdr.th_type = XFS_TRANS_CHECKPOINT;
- thdr.th_tid = tic->t_tid;
- thdr.th_num_items = num_iovecs;
- lhdr.i_addr = &thdr;
- lhdr.i_len = sizeof(xfs_trans_header_t);
- lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
- tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
-
- lvhdr.lv_niovecs = 1;
- lvhdr.lv_iovecp = &lhdr;
- lvhdr.lv_next = ctx->lv_chain;
-
- error = xlog_cil_write_chain(ctx, &lvhdr);
+ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
+ num_bytes += lvhdr.lv_bytes;
+
+ error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes);
if (error)
goto out_abort_free_ticket;
@@ -1052,7 +1142,7 @@ xlog_cil_push_work(
if (error)
goto out_abort_free_ticket;
- xfs_log_ticket_ungrant(log, tic);
+ xfs_log_ticket_ungrant(log, ctx->ticket);
/*
* If the checkpoint spans multiple iclogs, wait for all previous iclogs
@@ -1107,6 +1197,7 @@ xlog_cil_push_work(
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
+ xlog_cil_cleanup_whiteouts(&whiteouts);
return;
out_skip:
@@ -1116,8 +1207,9 @@ out_skip:
return;
out_abort_free_ticket:
- xfs_log_ticket_ungrant(log, tic);
+ xfs_log_ticket_ungrant(log, ctx->ticket);
ASSERT(xlog_is_shutdown(log));
+ xlog_cil_cleanup_whiteouts(&whiteouts);
if (!ctx->commit_iclog) {
xlog_cil_committed(ctx);
return;
@@ -1267,6 +1359,43 @@ xlog_cil_empty(
}
/*
+ * If there are intent done items in this transaction and the related intent was
+ * committed in the current (same) CIL checkpoint, we don't need to write either
+ * the intent or intent done item to the journal as the change will be
+ * journalled atomically within this checkpoint. As we cannot remove items from
+ * the CIL here, mark the related intent with a whiteout so that the CIL push
+ * can remove it rather than writing it to the journal. Then remove the intent
+ * done item from the current transaction and release it so it doesn't get put
+ * into the CIL at all.
+ */
+static uint32_t
+xlog_cil_process_intents(
+ struct xfs_cil *cil,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *ilip, *next;
+ uint32_t len = 0;
+
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
+ continue;
+
+ ilip = lip->li_ops->iop_intent(lip);
+ if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
+ continue;
+ set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
+ trace_xfs_cil_whiteout_mark(ilip);
+ len += ilip->li_lv->lv_bytes;
+ kmem_free(ilip->li_lv);
+ ilip->li_lv = NULL;
+
+ xfs_trans_del_item(lip);
+ lip->li_ops->iop_release(lip);
+ }
+ return len;
+}
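
Concretely, using the extent-free intent pair as an illustration (this patch does not itself touch the EFI/EFD code):

/*
 * EFI committed earlier in this checkpoint, EFD arriving now:
 *   intent (EFI): set XFS_LI_WHITEOUT and free li_lv; the CIL push later
 *                 moves it to the whiteout list instead of the journal
 *   done (EFD):   xfs_trans_del_item() + ->iop_release(); it never
 *                 enters the CIL at all
 * len accumulates the EFI's lv_bytes so the caller can hand that space
 * back to the transaction ticket.
 */
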
+
+/*
* Commit a transaction with the given vector to the Committed Item List.
*
* To do this, we need to format the item, pin it in memory if required and
@@ -1288,6 +1417,7 @@ xlog_cil_commit(
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_item *lip, *next;
+ uint32_t released_space = 0;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -1299,7 +1429,10 @@ xlog_cil_commit(
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
- xlog_cil_insert_items(log, tp);
+ if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
+ released_space = xlog_cil_process_intents(cil, tp);
+
+ xlog_cil_insert_items(log, tp, released_space);
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1456,32 +1589,6 @@ out_shutdown:
}
/*
- * Check if the current log item was first committed in this sequence.
- * We can't rely on just the log item being in the CIL, we have to check
- * the recorded commit sequence number.
- *
- * Note: for this to be used in a non-racy manner, it has to be called with
- * CIL flushing locked out. As a result, it should only be used during the
- * transaction commit process when deciding what to format into the item.
- */
-bool
-xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
-{
- struct xfs_cil *cil = lip->li_log->l_cilp;
-
- if (list_empty(&lip->li_cil))
- return false;
-
- /*
- * li_seq is written on the first commit of a log item to record the
- * first checkpoint it is written to. Hence if it is different to the
- * current sequence, we're in a new checkpoint.
- */
- return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
-}
-
-/*
* Perform initial CIL structure initialisation.
*/
int
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 401cdc400980..67fd9789e69a 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,8 +51,8 @@ enum xlog_iclog_state {
/*
* In core log flags
*/
-#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
-#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
+#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */
#define XLOG_ICL_STRINGS \
{ XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
@@ -62,7 +62,7 @@ enum xlog_iclog_state {
/*
* Log ticket flags
*/
-#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */
#define XLOG_TIC_FLAGS \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
@@ -142,19 +142,6 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-/* Ticket reservation region accounting */
-#define XLOG_TIC_LEN_MAX 15
-
-/*
- * Reservation region
- * As would be stored in xfs_log_iovec but without the i_addr which
- * we don't care about.
- */
-typedef struct xlog_res {
- uint r_len; /* region length :4 */
- uint r_type; /* region's transaction type :4 */
-} xlog_res_t;
-
typedef struct xlog_ticket {
struct list_head t_queue; /* reserve/write queue */
struct task_struct *t_task; /* task that owns this ticket */
@@ -164,15 +151,7 @@ typedef struct xlog_ticket {
int t_unit_res; /* unit reservation in bytes : 4 */
char t_ocnt; /* original count : 1 */
char t_cnt; /* current count : 1 */
- char t_clientid; /* who does this belong to; : 1 */
- char t_flags; /* properties of reservation : 1 */
-
- /* reservation array fields */
- uint t_res_num; /* num in array : 4 */
- uint t_res_num_ophdrs; /* num op hdrs : 4 */
- uint t_res_arr_sum; /* array sum : 4 */
- uint t_res_o_flow; /* sum overflow : 4 */
- xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
+ uint8_t t_flags; /* properties of reservation : 1 */
} xlog_ticket_t;
/*
@@ -211,7 +190,7 @@ typedef struct xlog_in_core {
u32 ic_offset;
enum xlog_iclog_state ic_state;
unsigned int ic_flags;
- char *ic_datap; /* pointer to iclog data */
+ void *ic_datap; /* pointer to iclog data */
struct list_head ic_callbacks;
/* reference counts need their own cacheline */
@@ -242,7 +221,6 @@ struct xfs_cil_ctx {
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
- int nvecs; /* number of regions */
int space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
@@ -441,10 +419,6 @@ struct xlog {
struct xfs_kobj l_kobj;
- /* The following field are used for debugging; need to hold icloglock */
-#ifdef DEBUG
- void *l_iclog_bak[XLOG_MAX_ICLOGS];
-#endif
 /* log recovery lsn tracking (for buffer submission) */
xfs_lsn_t l_recovery_lsn;
@@ -509,27 +483,14 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
extern struct kmem_cache *xfs_log_ticket_cache;
-struct xlog_ticket *
-xlog_ticket_alloc(
- struct xlog *log,
- int unit_bytes,
- int count,
- char client,
- bool permanent);
-
-static inline void
-xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
-{
- *ptr += bytes;
- *len -= bytes;
- *off += bytes;
-}
+struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
+ int count, bool permanent);
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
- uint optype);
+ uint32_t len);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
@@ -690,4 +651,38 @@ xlog_valid_lsn(
return valid;
}
+/*
+ * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
+ * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
+ * to fall back to vmalloc, so we can't actually do anything useful with gfp
+ * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
+ * will do direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get something straight away from the free lists or
+ * buddy allocator. Hence we have to open code kvmalloc ourselves here.
+ *
+ * This assumes that the caller uses memalloc_nofs_save task context, so
+ * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
+ * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
+ * allocations, so let's all just pretend this is a GFP_KERNEL context
+ * operation.
+ */
+static inline void *
+xlog_kvmalloc(
+ size_t buf_size)
+{
+ gfp_t flags = GFP_KERNEL;
+ void *p;
+
+ flags &= ~__GFP_DIRECT_RECLAIM;
+ flags |= __GFP_NOWARN | __GFP_NORETRY;
+ do {
+ p = kmalloc(buf_size, flags);
+ if (!p)
+ p = vmalloc(buf_size);
+ } while (!p);
+
+ return p;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
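A minimal usage sketch (hypothetical caller, not part of this patch): xlog_kvmalloc() relies on the caller having entered a memalloc_nofs_save() scope, so the GFP_KERNEL flags above are implicitly demoted to GFP_NOFS:

	/* Hypothetical caller sketch, not from this patch. */
	unsigned int nofs_flag = memalloc_nofs_save();
	void *buf = xlog_kvmalloc(buf_size);

	/* ... format log vector data into buf ... */

	memalloc_nofs_restore(nofs_flag);
	/* buf is later released with kvfree(), which handles both backends */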
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c4ad4296c540..97b941c07957 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1800,6 +1800,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_cud_item_ops,
&xlog_bui_item_ops,
&xlog_bud_item_ops,
+ &xlog_attri_item_ops,
+ &xlog_attrd_item_ops,
};
static const struct xlog_recover_item_ops *
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index bc66d95c8d4c..8f495cc23903 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -27,42 +27,34 @@ __xfs_printk(
printk("%sXFS: %pV\n", level, vaf);
}
-#define define_xfs_printk_level(func, kern_level) \
-void func(const struct xfs_mount *mp, const char *fmt, ...) \
-{ \
- struct va_format vaf; \
- va_list args; \
- int level; \
- \
- va_start(args, fmt); \
- \
- vaf.fmt = fmt; \
- vaf.va = &args; \
- \
- __xfs_printk(kern_level, mp, &vaf); \
- va_end(args); \
- \
- if (!kstrtoint(kern_level, 0, &level) && \
- level <= LOGLEVEL_ERR && \
- xfs_error_level >= XFS_ERRLEVEL_HIGH) \
- xfs_stack_trace(); \
-} \
-
-define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-define_xfs_printk_level(xfs_alert, KERN_ALERT);
-define_xfs_printk_level(xfs_crit, KERN_CRIT);
-define_xfs_printk_level(xfs_err, KERN_ERR);
-define_xfs_printk_level(xfs_warn, KERN_WARNING);
-define_xfs_printk_level(xfs_notice, KERN_NOTICE);
-define_xfs_printk_level(xfs_info, KERN_INFO);
-#ifdef DEBUG
-define_xfs_printk_level(xfs_debug, KERN_DEBUG);
-#endif
+void
+xfs_printk_level(
+ const char *kern_level,
+ const struct xfs_mount *mp,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int level;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(kern_level, mp, &vaf);
+
+ va_end(args);
+
+ if (!kstrtoint(kern_level, 0, &level) &&
+ level <= LOGLEVEL_ERR &&
+ xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
void
-xfs_alert_tag(
+_xfs_alert_tag(
const struct xfs_mount *mp,
- int panic_tag,
+ uint32_t panic_tag,
const char *fmt, ...)
{
struct va_format vaf;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index bb9860ec9a93..55ee464ab59f 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -6,33 +6,46 @@
struct xfs_mount;
-extern __printf(2, 3)
-void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
extern __printf(3, 4)
-void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp,
+ const char *fmt, ...);
+#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \
+ xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \
+})
+#define xfs_emerg(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__)
+#define xfs_alert(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__)
+#define xfs_crit(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__)
+#define xfs_err(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__)
+#define xfs_warn(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__)
+#define xfs_notice(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__)
+#define xfs_info(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__)
#ifdef DEBUG
-extern __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#define xfs_debug(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__)
#else
-static inline __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-{
-}
+#define xfs_debug(mp, fmt, ...) do {} while (0)
#endif
+#define xfs_alert_tag(mp, tag, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \
+ _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \
+})
+
+extern __printf(3, 4)
+void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag,
+ const char *fmt, ...);
+
#define xfs_printk_ratelimited(func, dev, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
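As a hedged illustration (hypothetical call sites, not from this patch), the new macro wrappers are invoked exactly like the old out-of-line functions, so existing callers need no changes while gaining printk index entries:

	/* Hypothetical call sites; mp is a struct xfs_mount *. */
	xfs_warn(mp, "device does not support discard");
	xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
			"corruption of in-memory data detected");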
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c5f153c3693f..0c0bcbd4949d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -468,6 +468,8 @@ STATIC int
xfs_check_summary_counts(
struct xfs_mount *mp)
{
+ int error = 0;
+
/*
* The AG0 superblock verifier rejects in-progress filesystems,
* so we should never see the flag set this far into mounting.
@@ -506,11 +508,32 @@ xfs_check_summary_counts(
* superblock to be correct and we don't need to do anything here.
* Otherwise, recalculate the summary counters.
*/
- if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) &&
- !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
- return 0;
+ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
+ xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
+ error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ if (error)
+ return error;
+ }
- return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ /*
+ * Older kernels misused sb_frextents to reflect both incore
+ * reservations made by running transactions and the actual count of
+ * free rt extents in the ondisk metadata. Transactions committed
+ * during runtime can therefore contain a superblock update that
+ * undercounts the number of free rt extents tracked in the rt bitmap.
+ * A clean unmount record will have the correct frextents value since
+ * there can be no other transactions running at that point.
+ *
+ * If we're mounting the rt volume after recovering the log, recompute
+ * frextents from the rtbitmap file to fix the inconsistency.
+ */
+ if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ error = xfs_rtalloc_reinit_frextents(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
}
/*
@@ -784,11 +807,6 @@ xfs_mountfs(
goto out_inodegc_shrinker;
}
- /* Make sure the summary counts are ok. */
- error = xfs_check_summary_counts(mp);
- if (error)
- goto out_log_dealloc;
-
/* Enable background inode inactivation workers. */
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
@@ -844,6 +862,11 @@ xfs_mountfs(
goto out_rele_rip;
}
+ /* Make sure the summary counts are ok. */
+ error = xfs_check_summary_counts(mp);
+ if (error)
+ goto out_rtunmount;
+
/*
* If this is a read-only mount defer the superblock updates until
* the next remount into writeable mode. Otherwise we would never
@@ -1087,24 +1110,33 @@ xfs_fs_writable(
return true;
}
+/* Adjust m_fdblocks or m_frextents. */
int
-xfs_mod_fdblocks(
+xfs_mod_freecounter(
struct xfs_mount *mp,
+ struct percpu_counter *counter,
int64_t delta,
bool rsvd)
{
int64_t lcounter;
long long res_used;
+ uint64_t set_aside = 0;
s32 batch;
- uint64_t set_aside;
+ bool has_resv_pool;
+
+ ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+ has_resv_pool = (counter == &mp->m_fdblocks);
+ if (rsvd)
+ ASSERT(has_resv_pool);
if (delta > 0) {
/*
* If the reserve pool is depleted, put blocks back into it
* first. Most of the time the pool is full.
*/
- if (likely(mp->m_resblks == mp->m_resblks_avail)) {
- percpu_counter_add(&mp->m_fdblocks, delta);
+ if (likely(!has_resv_pool ||
+ mp->m_resblks == mp->m_resblks_avail)) {
+ percpu_counter_add(counter, delta);
return 0;
}
@@ -1116,7 +1148,7 @@ xfs_mod_fdblocks(
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(&mp->m_fdblocks, delta);
+ percpu_counter_add(counter, delta);
}
spin_unlock(&mp->m_sb_lock);
return 0;
@@ -1130,7 +1162,7 @@ xfs_mod_fdblocks(
* then make everything serialise as we are real close to
* ENOSPC.
*/
- if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+ if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@@ -1147,9 +1179,10 @@ xfs_mod_fdblocks(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
- if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+ if (has_resv_pool)
+ set_aside = xfs_fdblocks_unavailable(mp);
+ percpu_counter_add_batch(counter, delta, batch);
+ if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
@@ -1160,8 +1193,8 @@ xfs_mod_fdblocks(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- percpu_counter_add(&mp->m_fdblocks, -delta);
- if (!rsvd)
+ percpu_counter_add(counter, -delta);
+ if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1178,24 +1211,6 @@ fdblocks_enospc:
return -ENOSPC;
}
-int
-xfs_mod_frextents(
- struct xfs_mount *mp,
- int64_t delta)
-{
- int64_t lcounter;
- int ret = 0;
-
- spin_lock(&mp->m_sb_lock);
- lcounter = mp->m_sb.sb_frextents + delta;
- if (lcounter < 0)
- ret = -ENOSPC;
- else
- mp->m_sb.sb_frextents = lcounter;
- spin_unlock(&mp->m_sb_lock);
- return ret;
-}
-
/*
* Used to free the superblock along various error paths.
*/
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f6dc19de8322..8c42786e4942 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,8 @@ typedef struct xfs_mount {
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
+ struct percpu_counter m_frextents; /* free rt extent counter */
+
/*
* Count of data device blocks reserved for delayed allocations,
* including indlen blocks. Does not include allocated CoW staging
@@ -276,6 +278,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
+#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME)
__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
+__XFS_HAS_FEAT(large_extent_counts, NREXT64)
/*
* Mount features
@@ -425,16 +429,15 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-#define xfs_is_shutdown(mp) xfs_is_shutdown(mp)
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
int lnnum);
#define xfs_force_shutdown(m,f) \
xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
-#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */
#define XFS_SHUTDOWN_STRINGS \
{ SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
@@ -494,9 +497,20 @@ xfs_fdblocks_unavailable(
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
-extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
- bool reserved);
-extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ int64_t delta, bool rsvd);
+
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+ return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+ return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+}
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
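For illustration, a hedged caller sketch (hypothetical, not from this patch) showing how both inline wrappers funnel reservations through xfs_mod_freecounter() and how a caller might unwind on failure:

	/* Hypothetical reserve-and-unwind sketch. */
	error = xfs_mod_fdblocks(mp, -(int64_t)blocks, rsvd);
	if (error)
		return error;			/* -ENOSPC on the data device */

	error = xfs_mod_frextents(mp, -(int64_t)rtextents);
	if (error) {
		xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);	/* undo */
		return error;			/* -ENOSPC on the rt device */
	}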
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 25991923c1a8..758702b9495f 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
/*
* The v5 superblock format extended several v4 header structures with
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f165d1a3de1d..8fc813cb6011 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -582,9 +582,6 @@ xfs_qm_init_timelimits(
defq->blk.time = XFS_QM_BTIMELIMIT;
defq->ino.time = XFS_QM_ITIMELIMIT;
defq->rtb.time = XFS_QM_RTBTIMELIMIT;
- defq->blk.warn = XFS_QM_BWARNLIMIT;
- defq->ino.warn = XFS_QM_IWARNLIMIT;
- defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
/*
* We try to get the limits from the superuser's limits fields.
@@ -608,12 +605,6 @@ xfs_qm_init_timelimits(
defq->ino.time = dqp->q_ino.timer;
if (dqp->q_rtb.timer)
defq->rtb.time = dqp->q_rtb.timer;
- if (dqp->q_blk.warnings)
- defq->blk.warn = dqp->q_blk.warnings;
- if (dqp->q_ino.warnings)
- defq->ino.warn = dqp->q_ino.warnings;
- if (dqp->q_rtb.warnings)
- defq->rtb.warn = dqp->q_rtb.warnings;
xfs_qm_dqdestroy(dqp);
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5bb12717ea28..9683f0457d19 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -34,7 +34,6 @@ struct xfs_quota_limits {
xfs_qcnt_t hard; /* default hard limit */
xfs_qcnt_t soft; /* default soft limit */
time64_t time; /* limit for timers */
- xfs_qwarncnt_t warn; /* limit for warnings */
};
/* Defaults for each quota type: time limits, warn limits, usage limits */
@@ -134,10 +133,6 @@ struct xfs_dquot_acct {
#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
-#define XFS_QM_BWARNLIMIT 5
-#define XFS_QM_IWARNLIMIT 5
-#define XFS_QM_RTBWARNLIMIT 5
-
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
/* quota ops */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 7d5a31827681..74ac9ca9e119 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -217,8 +217,7 @@ xfs_qm_scall_quotaon(
return 0;
}
-#define XFS_QC_MASK \
- (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK)
/*
* Adjust limits of this quota, and the defaults if passed in. Returns true
@@ -251,17 +250,6 @@ xfs_setqlim_limits(
}
static inline void
-xfs_setqlim_warns(
- struct xfs_dquot_res *res,
- struct xfs_quota_limits *qlim,
- int warns)
-{
- res->warnings = warns;
- if (qlim)
- qlim->warn = warns;
-}
-
-static inline void
xfs_setqlim_timer(
struct xfs_mount *mp,
struct xfs_dquot_res *res,
@@ -354,8 +342,6 @@ xfs_qm_scall_setqlim(
if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
xfs_dquot_set_prealloc_limits(dqp);
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
if (newlim->d_fieldmask & QC_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
@@ -370,8 +356,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->rtb : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
@@ -386,8 +370,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->ino : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
- if (newlim->d_fieldmask & QC_INO_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
if (newlim->d_fieldmask & QC_INO_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
@@ -428,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc(
dst->d_ino_count = dqp->q_ino.reserved;
dst->d_spc_timer = dqp->q_blk.timer;
dst->d_ino_timer = dqp->q_ino.timer;
- dst->d_ino_warns = dqp->q_ino.warnings;
- dst->d_spc_warns = dqp->q_blk.warnings;
+ dst->d_ino_warns = 0;
+ dst->d_spc_warns = 0;
dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
dst->d_rt_spc_timer = dqp->q_rtb.timer;
- dst->d_rt_spc_warns = dqp->q_rtb.warnings;
+ dst->d_rt_spc_warns = 0;
/*
* Internally, we don't reset all the timers when quota enforcement
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 07989bd67728..9c162e69976b 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -40,9 +40,9 @@ xfs_qm_fill_state(
tstate->spc_timelimit = (u32)defq->blk.time;
tstate->ino_timelimit = (u32)defq->ino.time;
tstate->rt_spc_timelimit = (u32)defq->rtb.time;
- tstate->spc_warnlimit = defq->blk.warn;
- tstate->ino_warnlimit = defq->ino.warn;
- tstate->rt_spc_warnlimit = defq->rtb.warn;
+ tstate->spc_warnlimit = 0;
+ tstate->ino_warnlimit = 0;
+ tstate->rt_spc_warnlimit = 0;
if (tempqip)
xfs_irele(ip);
}
@@ -98,7 +98,7 @@ xfs_quota_type(int type)
}
}
-#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK)
/*
* Adjust quota timers & warnings
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0d868c93144d..7e97bf19793d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -53,10 +54,11 @@ xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
ASSERT(atomic_read(&cuip->cui_refcount) > 0);
- if (atomic_dec_and_test(&cuip->cui_refcount)) {
- xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_cui_item_free(cuip);
- }
+ if (!atomic_dec_and_test(&cuip->cui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&cuip->cui_item, 0);
+ xfs_cui_item_free(cuip);
}
@@ -204,14 +206,24 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_cache, cudp);
}
+static struct xfs_log_item *
+xfs_cud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &CUD_ITEM(lip)->cud_cuip->cui_item;
+}
+
static const struct xfs_item_ops xfs_cud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_cud_item_size,
.iop_format = xfs_cud_item_format,
.iop_release = xfs_cud_item_release,
+ .iop_intent = xfs_cud_item_intent,
};
static struct xfs_cud_log_item *
@@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update(
* 1.) releases the CUI and frees the CUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
return error;
@@ -600,6 +612,7 @@ xfs_cui_item_relog(
}
static const struct xfs_item_ops xfs_cui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
.iop_unpin = xfs_cui_item_unpin,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 54e68e5693fd..e7a7c00d93be 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -586,21 +586,21 @@ out:
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- xfs_fileoff_t *end_fsb)
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
{
- struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- xfs_filblks_t rlen;
unsigned int resblks;
+ int nmaps;
int error;
/* No COW extents? That's easy! */
if (ifp->if_bytes == 0) {
- *end_fsb = offset_fsb;
+ *offset_fsb = end_fsb;
return 0;
}
@@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
goto out_cancel;
@@ -628,42 +631,66 @@ xfs_reflink_end_cow_extent(
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
- got.br_startoff + got.br_blockcount <= offset_fsb) {
- *end_fsb = offset_fsb;
+ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
goto out_cancel;
}
/*
- * Structure copy @got into @del, then trim @del to the range that we
- * were asked to remap. We preserve @got for the eventual CoW fork
- * deletion; from now on @del represents the mapping that we're
- * actually remapping.
- */
- del = got;
- xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
-
- ASSERT(del.br_blockcount > 0);
-
- /*
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
- * need to skip them.
+ * need to skip them. Preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
*/
- if (!xfs_bmap_is_written_extent(&got)) {
- *end_fsb = del.br_startoff;
- goto out_cancel;
+ while (!xfs_bmap_is_written_extent(&got)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
}
+ del = got;
- /* Unmap the old blocks in the data fork. */
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ /* Grab the corresponding mapping in the data fork. */
+ nmaps = 1;
+ error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+ &nmaps, 0);
if (error)
goto out_cancel;
- /* Trim the extent to whatever got unmapped. */
- xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
- trace_xfs_reflink_cow_remap(ip, &del);
+ /* We can only remap the smaller of the two extent sizes. */
+ data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+ del.br_blockcount = data.br_blockcount;
+
+ trace_xfs_reflink_cow_remap_from(ip, &del);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ if (xfs_bmap_is_real_extent(&data)) {
+ /*
+ * If the extent we're remapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_refcount_decrease_extent(tp, &data);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -data.br_blockcount);
+ } else if (data.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're remapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, data.br_startoff,
+ data.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
@@ -684,7 +711,7 @@ xfs_reflink_end_cow_extent(
return error;
/* Update the caller about how much progress we made. */
- *end_fsb = del.br_startoff;
+ *offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
out_cancel:
@@ -712,7 +739,7 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
- * Walk backwards until we're out of the I/O range. The loop function
+ * Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
@@ -744,7 +771,7 @@ xfs_reflink_end_cow(
* blocks will be remapped.
*/
while (end_fsb > offset_fsb && !error)
- error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+ error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
@@ -1121,6 +1148,8 @@ xfs_reflink_remap_extent(
++iext_delta;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto out_cancel;
@@ -1133,7 +1162,7 @@ xfs_reflink_remap_extent(
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
- xfs_filblks_t len = smap.br_blockcount;
+ int done;
/*
* If the extent we're unmapping is a delalloc reservation,
@@ -1141,10 +1170,11 @@ xfs_reflink_remap_extent(
* incore state. Dropping the delalloc reservation takes care
* of the quota reservation for us.
*/
- error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+ error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+ smap.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
- ASSERT(len == 0);
+ ASSERT(done);
}
/*
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a22b2d19ef91..fef92e02f3bb 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -53,10 +54,11 @@ xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
ASSERT(atomic_read(&ruip->rui_refcount) > 0);
- if (atomic_dec_and_test(&ruip->rui_refcount)) {
- xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_rui_item_free(ruip);
- }
+ if (!atomic_dec_and_test(&ruip->rui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&ruip->rui_item, 0);
+ xfs_rui_item_free(ruip);
}
STATIC void
@@ -227,14 +229,24 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_cache, rudp);
}
+static struct xfs_log_item *
+xfs_rud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &RUD_ITEM(lip)->rud_ruip->rui_item;
+}
+
static const struct xfs_item_ops xfs_rud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_rud_item_size,
.iop_format = xfs_rud_item_format,
.iop_release = xfs_rud_item_release,
+ .iop_intent = xfs_rud_item_intent,
};
static struct xfs_rud_log_item *
@@ -327,7 +339,7 @@ xfs_trans_log_finish_rmap_update(
* 1.) releases the RUI and frees the RUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
return error;
@@ -630,6 +642,7 @@ xfs_rui_item_relog(
}
static const struct xfs_item_ops xfs_rui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
.iop_unpin = xfs_rui_item_unpin,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b8c79ee791af..292d5e54a92c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -806,6 +806,9 @@ xfs_growfs_rt_alloc(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -1284,6 +1287,44 @@ xfs_rtmount_init(
return 0;
}
+static int
+xfs_rtalloc_count_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ uint64_t *valp = priv;
+
+ *valp += rec->ar_extcount;
+ return 0;
+}
+
+/*
+ * Reinitialize the number of free realtime extents from the realtime bitmap.
+ * Callers must ensure that there is no other activity in the filesystem.
+ */
+int
+xfs_rtalloc_reinit_frextents(
+ struct xfs_mount *mp)
+{
+ uint64_t val = 0;
+ int error;
+
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
+ &val);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_frextents = val;
+ spin_unlock(&mp->m_sb_lock);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ return 0;
+}
+
/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 91b00289509b..62c7ad79cbb6 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -22,6 +22,7 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv);
@@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_trans *tp,
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
bool *is_free);
+int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
# define xfs_verify_rtbno(m, r) (false)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 54be9d64093e..8495ef076ffc 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -843,9 +843,11 @@ xfs_fs_statfs(
if (XFS_IS_REALTIME_MOUNT(mp) &&
(ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+ s64 freertx;
+
statp->f_blocks = sbp->sb_rblocks;
- statp->f_bavail = statp->f_bfree =
- sbp->sb_frextents * sbp->sb_rextsize;
+ freertx = percpu_counter_sum_positive(&mp->m_frextents);
+ statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
}
return 0;
@@ -1015,8 +1017,14 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc;
+
return 0;
+free_delalloc:
+ percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
@@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters(
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
}
static void
@@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters(
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
+ percpu_counter_destroy(&mp->m_frextents);
}
static int
@@ -1608,14 +1618,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
- if (xfs_has_discard(mp)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- xfs_warn(mp, "mounting with \"discard\" option, but "
- "the device does not support discard");
- mp->m_features &= ~XFS_FEAT_DISCARD;
- }
+ if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) {
+ xfs_warn(mp,
+ "mounting with \"discard\" option, but the device does not support discard");
+ mp->m_features &= ~XFS_FEAT_DISCARD;
}
if (xfs_has_reflink(mp)) {
@@ -1639,6 +1645,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_large_extent_counts(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index affbedf78160..4145ba872547 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -226,11 +226,6 @@ xfs_symlink(
goto out_trans_cancel;
}
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
* Allocate an inode for the symlink.
*/
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 7692e76ead33..f78ad6b10ea5 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -83,6 +83,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
#ifdef DEBUG
int pwork_threads; /* parallel workqueue threads */
+ bool larp; /* log attribute replay */
#endif
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 574b80c29fe1..f7faf6e70d7f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -228,6 +228,29 @@ pwork_threads_show(
return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+
+static ssize_t
+larp_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.larp);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+STATIC ssize_t
+larp_show(
+ struct kobject *kobject,
+ char *buf)
+{
+	return sysfs_emit(buf, "%d\n", xfs_globals.larp);
+}
+XFS_SYSFS_ATTR_RW(larp);
#endif /* DEBUG */
static struct attribute *xfs_dbg_attrs[] = {
@@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(always_cow),
#ifdef DEBUG
ATTR_LIST(pwork_threads),
+ ATTR_LIST(larp),
#endif
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b141ef78c755..d32026585c1b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -418,6 +418,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__field(unsigned, lockval)
__field(unsigned, flags)
__field(unsigned long, caller_ip)
+ __field(const void *, buf_ops)
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
@@ -428,9 +429,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
+ __entry->buf_ops = bp->b_ops;
),
TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
- "lock %d flags %s caller %pS",
+ "lock %d flags %s bufops %pS caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->nblks,
@@ -438,6 +440,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->pincount,
__entry->lockval,
__print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ __entry->buf_ops,
(void *)__entry->caller_ip)
)
@@ -1096,22 +1099,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
-#define XFS_QMOPT_FLAGS \
- { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
- { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
- { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
- { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
- { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
- { XFS_QMOPT_INHERIT, "INHERIT" }, \
- { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
- { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
- { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
- { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
- { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
- { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
- { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
- { XFS_QMOPT_RES_INOS, "RES_INOS" }
-
TRACE_EVENT(xfs_trans_mod_dquot,
TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
unsigned int field, int64_t delta),
@@ -1348,6 +1335,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1924,7 +1914,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__field(int, namelen)
__field(xfs_dahash_t, hashval)
__field(xfs_ino_t, inumber)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1990,7 +1980,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(xfs_dahash_t, hashval)
__field(unsigned int, attr_filter)
__field(unsigned int, attr_flags)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -2097,7 +2087,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, idx)
),
TP_fast_assign(
@@ -2128,7 +2118,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, src_idx)
__field(int, dst_idx)
__field(int, count)
@@ -2169,7 +2159,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(int, which)
__field(xfs_ino_t, ino)
__field(int, format)
- __field(int, nex)
+ __field(xfs_extnum_t, nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -2182,7 +2172,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
- TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+ TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
"broot size %d, forkoff 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -3418,7 +3408,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -3513,7 +3504,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
-TRACE_EVENT(xfs_trans_resv_calc,
+DECLARE_EVENT_CLASS(xfs_trans_resv_class,
TP_PROTO(struct xfs_mount *mp, unsigned int type,
struct xfs_trans_res *res),
TP_ARGS(mp, type, res),
@@ -3537,6 +3528,33 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logres,
__entry->logcount,
__entry->logflags)
+)
+
+#define DEFINE_TRANS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_trans_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int type, \
+ struct xfs_trans_res *res), \
+ TP_ARGS(mp, type, res))
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
+
+TRACE_EVENT(xfs_log_get_max_trans_res,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
+ TP_ARGS(mp, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint, logres)
+ __field(int, logcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->logres = res->tr_logres;
+ __entry->logcount = res->tr_logcount;
+ ),
+ TP_printk("dev %d:%d logres %u logcount %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->logres,
+ __entry->logcount)
);
DECLARE_EVENT_CLASS(xfs_trans_class,
@@ -4111,6 +4129,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_DONE);
+
DECLARE_EVENT_CLASS(xfs_das_state_class,
TP_PROTO(int das, struct xfs_inode *ip),
TP_ARGS(das, ip),
@@ -4122,8 +4161,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class,
__entry->das = das;
__entry->ino = ip->i_ino;
),
- TP_printk("state change %d ino 0x%llx",
- __entry->das, __entry->ino)
+ TP_printk("state change %s ino 0x%llx",
+ __print_symbolic(__entry->das, XFS_DAS_STRINGS),
+ __entry->ino)
)
#define DEFINE_DAS_STATE_EVENT(name) \
@@ -4132,9 +4172,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \
TP_ARGS(das, ip))
DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
+
TRACE_EVENT(xfs_force_shutdown,
TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0ac717aad380..82cf0189c0db 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -32,7 +32,6 @@ static void
xfs_trans_trace_reservations(
struct xfs_mount *mp)
{
- struct xfs_trans_res resv;
struct xfs_trans_res *res;
struct xfs_trans_res *end_res;
int i;
@@ -41,8 +40,6 @@ xfs_trans_trace_reservations(
end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
for (i = 0; res < end_res; i++, res++)
trace_xfs_trans_resv_calc(mp, i, res);
- xfs_log_get_max_trans_res(mp, &resv);
- trace_xfs_trans_resv_calc(mp, -1, &resv);
}
#else
# define xfs_trans_trace_reservations(mp)
@@ -194,11 +191,9 @@ xfs_trans_reserve(
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(mp,
- resp->tr_logres,
+ error = xfs_log_reserve(mp, resp->tr_logres,
resp->tr_logcount,
- &tp->t_ticket, XFS_TRANSACTION,
- permanent);
+ &tp->t_ticket, permanent);
}
if (error)
@@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas(
be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
}
- if (tp->t_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
- if (tp->t_res_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+ /*
+	 * Updating frextents requires careful handling because it does not
+	 * behave like the lazysb counters: we cannot rely on log recovery in
+	 * older kernels to recompute the value from the rtbitmap.
+ * This means that the ondisk frextents must be consistent with the
+ * rtbitmap.
+ *
+ * Therefore, log the frextents change to the ondisk superblock and
+ * update the incore superblock so that future calls to xfs_log_sb
+ * write the correct value ondisk.
+ *
+ * Don't touch m_frextents because it includes incore reservations,
+ * and those are handled by the unreserve function.
+ */
+ if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+ struct xfs_mount *mp = tp->t_mountp;
+ int64_t rtxdelta;
+
+ rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+ spin_lock(&mp->m_sb_lock);
+ be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+ mp->m_sb.sb_frextents += rtxdelta;
+ spin_unlock(&mp->m_sb_lock);
+ }
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb(
if (ifreedelta)
percpu_counter_add(&mp->m_ifree, ifreedelta);
- if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+ if (rtxdelta) {
+ error = xfs_mod_frextents(mp, rtxdelta);
+ ASSERT(!error);
+ }
+
+ if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
/* apply remaining deltas */
@@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
mp->m_sb.sb_icount += idelta;
mp->m_sb.sb_ifree += ifreedelta;
- mp->m_sb.sb_frextents += rtxdelta;
+ /*
+	 * Do not touch sb_frextents here because we are dealing with incore
+	 * reservations. sb_frextents is not part of the lazy sb counters, so it
+ * must be consistent with the ondisk rtbitmap and must never include
+ * incore reservations.
+ */
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0c82673238f4..9561f193e7e1 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -55,13 +55,15 @@ struct xfs_log_item {
#define XFS_LI_IN_AIL 0
#define XFS_LI_ABORTED 1
#define XFS_LI_FAILED 2
-#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
+#define XFS_LI_DIRTY 3
+#define XFS_LI_WHITEOUT 4
#define XFS_LI_FLAGS \
- { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
- { (1 << XFS_LI_ABORTED), "ABORTED" }, \
- { (1 << XFS_LI_FAILED), "FAILED" }, \
- { (1 << XFS_LI_DIRTY), "DIRTY" }
+ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1u << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1u << XFS_LI_FAILED), "FAILED" }, \
+ { (1u << XFS_LI_DIRTY), "DIRTY" }, \
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
struct xfs_item_ops {
unsigned flags;
@@ -78,30 +80,32 @@ struct xfs_item_ops {
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
struct xfs_trans *tp);
+ struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
-/* Is this log item a deferred action intent? */
+/*
+ * Log item ops flags
+ */
+/*
+ * Release the log item when the journal commits instead of inserting into the
+ * AIL for writeback tracking and/or log tail pinning.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+#define XFS_ITEM_INTENT (1 << 1)
+#define XFS_ITEM_INTENT_DONE (1 << 2)
+
static inline bool
xlog_item_is_intent(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_recover != NULL &&
- lip->li_ops->iop_match != NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT;
}
-/* Is this a log intent-done item? */
static inline bool
xlog_item_is_intent_done(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_unpin == NULL &&
- lip->li_ops->iop_push == NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
}
-/*
- * Release the log item as soon as committed. This is for items just logging
- * intents that never need to be written back in place.
- */
-#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
-
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
int type, const struct xfs_item_ops *ops);
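A hedged sketch (hypothetical, not from this patch) of how recovery-side code can now classify items purely by ops flags rather than probing for method pointers:

	/* Hypothetical classification sketch. */
	if (xlog_item_is_intent(lip)) {
		/* an intent: replay it or cancel it during recovery */
	} else if (xlog_item_is_intent_done(lip)) {
		/*
		 * a done item: lip->li_ops->iop_intent(lip) returns the
		 * intent it completes, for pairing and cancellation
		 */
	}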
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 9ba7e6b9bed3..aa00cf67ad72 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -597,13 +597,11 @@ xfs_dqresv_check(
if (softlimit && total_count > softlimit) {
time64_t now = ktime_get_real_seconds();
- if ((res->timer != 0 && now > res->timer) ||
- (res->warnings != 0 && res->warnings >= qlim->warn)) {
+ if (res->timer != 0 && now > res->timer) {
*fatal = true;
return QUOTA_NL_ISOFTLONGWARN;
}
- res->warnings++;
return QUOTA_NL_ISOFTWARN;
}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 0d050f8829ef..7a044afd4c46 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -12,9 +12,9 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_acl.h"
-#include "xfs_da_btree.h"
#include <linux/posix_acl_xattr.h>
diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile
index 33c1a4f1132e..9fe54f5319f2 100644
--- a/fs/zonefs/Makefile
+++ b/fs/zonefs/Makefile
@@ -3,4 +3,4 @@ ccflags-y += -I$(src)
obj-$(CONFIG_ZONEFS_FS) += zonefs.o
-zonefs-y := super.o
+zonefs-y := super.o sysfs.o
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index e20e7c841489..bcb21aea990a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -27,6 +27,39 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
+/*
+ * Manage the active zone count. Called with zi->i_truncate_mutex held.
+ */
+static void zonefs_account_active(struct inode *inode)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ lockdep_assert_held(&zi->i_truncate_mutex);
+
+ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+ return;
+
+ /*
+ * If the zone is active, that is, if it is explicitly open or
+ * partially written, check if it was already accounted as active.
+ */
+ if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
+ (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
+ if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
+ zi->i_flags |= ZONEFS_ZONE_ACTIVE;
+ atomic_inc(&sbi->s_active_seq_files);
+ }
+ return;
+ }
+
+ /* The zone is not active. If it was, update the active count */
+ if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
+ zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
+ atomic_dec(&sbi->s_active_seq_files);
+ }
+}
+
static inline int zonefs_zone_mgmt(struct inode *inode,
enum req_opf op)
{
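A hedged sketch of the intended call pattern for zonefs_account_active() (hypothetical caller, not from this patch): every update of the write pointer or of the ZONEFS_ZONE_OPEN flag is followed by a re-account, all under i_truncate_mutex:

	/* new_wpoffset is a hypothetical value computed by the caller. */
	mutex_lock(&zi->i_truncate_mutex);
	zi->i_wpoffset = new_wpoffset;	/* or set/clear ZONEFS_ZONE_OPEN */
	zonefs_account_active(inode);	/* keep s_active_seq_files in sync */
	mutex_unlock(&zi->i_truncate_mutex);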
@@ -68,8 +101,13 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
* A full zone is no longer open/active and does not need
* explicit closing.
*/
- if (isize >= zi->i_max_size)
- zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+ if (isize >= zi->i_max_size) {
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+
+ if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
+ atomic_dec(&sbi->s_active_seq_files);
+ zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
+ }
}
static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -124,9 +162,9 @@ static const struct iomap_ops zonefs_iomap_ops = {
.iomap_begin = zonefs_iomap_begin,
};
-static int zonefs_readpage(struct file *unused, struct page *page)
+static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
- return iomap_readpage(page, &zonefs_iomap_ops);
+ return iomap_read_folio(folio, &zonefs_iomap_ops);
}
static void zonefs_readahead(struct readahead_control *rac)
@@ -192,12 +230,12 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,
}
static const struct address_space_operations zonefs_file_aops = {
- .readpage = zonefs_readpage,
+ .read_folio = zonefs_read_folio,
.readahead = zonefs_readahead,
.writepage = zonefs_writepage,
.writepages = zonefs_writepages,
.dirty_folio = filemap_dirty_folio,
- .releasepage = iomap_releasepage,
+ .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.migratepage = iomap_migrate_page,
.is_partially_uptodate = iomap_is_partially_uptodate,
@@ -397,6 +435,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_update_stats(inode, data_size);
zonefs_i_size_write(inode, data_size);
zi->i_wpoffset = data_size;
+ zonefs_account_active(inode);
return 0;
}
@@ -508,6 +547,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
zonefs_update_stats(inode, isize);
truncate_setsize(inode, isize);
zi->i_wpoffset = isize;
+ zonefs_account_active(inode);
unlock:
mutex_unlock(&zi->i_truncate_mutex);
@@ -689,13 +729,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int max;
+ unsigned int max = bdev_max_zone_append_sectors(bdev);
struct bio *bio;
ssize_t size;
int nr_pages;
ssize_t ret;
- max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
iov_iter_truncate(from, max);
@@ -861,13 +900,20 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, 0, 0);
+ &zonefs_write_dio_ops, 0, NULL, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
count = ret;
+
+ /*
+ * Update the zone write pointer offset assuming the write
+ * operation succeeded. If it did not, the error recovery path
+ * will correct it. Also do active seq file accounting.
+ */
mutex_lock(&zi->i_truncate_mutex);
zi->i_wpoffset += count;
+ zonefs_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -996,7 +1042,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, 0, 0);
+ &zonefs_read_dio_ops, 0, NULL, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
@@ -1009,13 +1055,13 @@ inode_unlock:
return ret;
}
-static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
+/*
+ * Write open accounting is done only for sequential files.
+ */
+static inline bool zonefs_seq_file_need_wro(struct inode *inode,
+ struct file *file)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
-
- if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
- return false;
if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
return false;
@@ -1026,28 +1072,34 @@ static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *fi
return true;
}
-static int zonefs_open_zone(struct inode *inode)
+static int zonefs_seq_file_write_open(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
if (!zi->i_wr_refcnt) {
- if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
- atomic_dec(&sbi->s_open_zones);
- ret = -EBUSY;
- goto unlock;
- }
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
- if (i_size_read(inode) < zi->i_max_size) {
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
- if (ret) {
- atomic_dec(&sbi->s_open_zones);
+ if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+ if (wro > sbi->s_max_wro_seq_files) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ ret = -EBUSY;
goto unlock;
}
- zi->i_flags |= ZONEFS_ZONE_OPEN;
+
+ if (i_size_read(inode) < zi->i_max_size) {
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ if (ret) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ goto unlock;
+ }
+ zi->i_flags |= ZONEFS_ZONE_OPEN;
+ zonefs_account_active(inode);
+ }
}
}
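With the explicit_open mount option, the write-open count is capped at the device's max open zones before REQ_OP_ZONE_OPEN is issued, so an open(2) for writing beyond the limit fails with EBUSY. A hypothetical userspace check (mount point and file layout assumed):

/* Open sequential zone files for writing until the limit is hit. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
        char path[64];
        int i, fd;

        for (i = 0; ; i++) {
                snprintf(path, sizeof(path), "/mnt/zonefs/seq/%d", i);
                /* Triggers zonefs_seq_file_write_open(); fds are kept open
                 * on purpose so the write-open count stays elevated. */
                fd = open(path, O_WRONLY);
                if (fd < 0) {
                        if (errno == EBUSY)
                                printf("limit hit after %d files\n", i);
                        break;
                }
        }
        return 0;
}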
@@ -1067,30 +1119,31 @@ static int zonefs_file_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- if (zonefs_file_use_exp_open(inode, file))
- return zonefs_open_zone(inode);
+ if (zonefs_seq_file_need_wro(inode, file))
+ return zonefs_seq_file_write_open(inode);
return 0;
}
-static void zonefs_close_zone(struct inode *inode)
+static void zonefs_seq_file_write_close(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
- zi->i_wr_refcnt--;
- if (!zi->i_wr_refcnt) {
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
- struct super_block *sb = inode->i_sb;
- /*
- * If the file zone is full, it is not open anymore and we only
- * need to decrement the open count.
- */
- if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
- goto dec;
+ zi->i_wr_refcnt--;
+ if (zi->i_wr_refcnt)
+ goto unlock;
+ /*
+ * The file zone may not be open anymore (e.g. the file was truncated to
+ * its maximum size or it was fully written). In that case, we only
+ * need to decrement the write open count.
+ */
+ if (zi->i_flags & ZONEFS_ZONE_OPEN) {
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
if (ret) {
__zonefs_io_error(inode, false);
@@ -1102,14 +1155,23 @@ static void zonefs_close_zone(struct inode *inode)
*/
if (zi->i_flags & ZONEFS_ZONE_OPEN &&
!(sb->s_flags & SB_RDONLY)) {
- zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
+ zonefs_warn(sb,
+ "closing zone at %llu failed %d\n",
+ zi->i_zsector, ret);
+ zonefs_warn(sb,
+ "remounting filesystem read-only\n");
sb->s_flags |= SB_RDONLY;
}
+ goto unlock;
}
+
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
-dec:
- atomic_dec(&sbi->s_open_zones);
+ zonefs_account_active(inode);
}
+
+ atomic_dec(&sbi->s_wro_seq_files);
+
+unlock:
mutex_unlock(&zi->i_truncate_mutex);
}
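Leaving a zone implicitly open after a failed explicit REQ_OP_ZONE_CLOSE would let the s_wro_seq_files count drift from the device's real open zone count, so when the close fails on a writable filesystem the only safe option is to stop accepting writes and remount read-only.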
@@ -1121,8 +1183,8 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
* the zone has gone offline or read-only). Make sure we don't fail the
* close(2) for user-space.
*/
- if (zonefs_file_use_exp_open(inode, file))
- zonefs_close_zone(inode);
+ if (zonefs_seq_file_need_wro(inode, file))
+ zonefs_seq_file_write_close(inode);
return 0;
}
@@ -1337,6 +1399,8 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+ mutex_lock(&zi->i_truncate_mutex);
+
/*
* For sequential zones, make sure that any open zone is closed first
* to ensure that the initial number of open zones is 0, in sync with
@@ -1346,11 +1410,16 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
if (type == ZONEFS_ZTYPE_SEQ &&
(zone->cond == BLK_ZONE_COND_IMP_OPEN ||
zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
- mutex_lock(&zi->i_truncate_mutex);
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
- mutex_unlock(&zi->i_truncate_mutex);
+ if (ret)
+ goto unlock;
}
+ zonefs_account_active(inode);
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+
return ret;
}
@@ -1688,14 +1757,18 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_gid = GLOBAL_ROOT_GID;
sbi->s_perm = 0640;
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
- sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
- atomic_set(&sbi->s_open_zones, 0);
- if (!sbi->s_max_open_zones &&
+
+ atomic_set(&sbi->s_wro_seq_files, 0);
+ sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
+ if (!sbi->s_max_wro_seq_files &&
sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
}
+ atomic_set(&sbi->s_active_seq_files, 0);
+ sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
+
ret = zonefs_read_super(sb);
if (ret)
return ret;
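bdev_max_active_zones(), like bdev_max_open_zones() above it, appears to be a block layer wrapper around the request queue zone limits added in the same cycle; a device reporting 0 advertises no active zone limit, and the sysfs counters added below simply report whatever the device says.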
@@ -1710,6 +1783,10 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
if (ret)
goto cleanup;
+ ret = zonefs_sysfs_register(sb);
+ if (ret)
+ goto cleanup;
+
zonefs_info(sb, "Mounting %u zones",
blkdev_nr_zones(sb->s_bdev->bd_disk));
@@ -1755,6 +1832,8 @@ static void zonefs_kill_super(struct super_block *sb)
if (sb->s_root)
d_genocide(sb->s_root);
+
+ zonefs_sysfs_unregister(sb);
kill_block_super(sb);
kfree(sbi);
}
@@ -1802,16 +1881,26 @@ static int __init zonefs_init(void)
return ret;
ret = register_filesystem(&zonefs_type);
- if (ret) {
- zonefs_destroy_inodecache();
- return ret;
- }
+ if (ret)
+ goto destroy_inodecache;
+
+ ret = zonefs_sysfs_init();
+ if (ret)
+ goto unregister_fs;
return 0;
+
+unregister_fs:
+ unregister_filesystem(&zonefs_type);
+destroy_inodecache:
+ zonefs_destroy_inodecache();
+
+ return ret;
}
static void __exit zonefs_exit(void)
{
+ zonefs_sysfs_exit();
zonefs_destroy_inodecache();
unregister_filesystem(&zonefs_type);
}
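Note that the exit path is not the strict reverse of zonefs_init(): the inode cache is destroyed before the filesystem type is unregistered. This is presumably safe because module unload is only possible once no zonefs superblock remains mounted, but the unwind labels added to zonefs_init() do keep the error path itself symmetric.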
diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c
new file mode 100644
index 000000000000..9cb6755ce39a
--- /dev/null
+++ b/fs/zonefs/sysfs.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Simple file system for zoned block devices exposing zones as files.
+ *
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+
+#include "zonefs.h"
+
+struct zonefs_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf);
+};
+
+static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr)
+{
+ return container_of(attr, struct zonefs_sysfs_attr, attr);
+}
+
+#define ZONEFS_SYSFS_ATTR_RO(name) \
+static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name)
+
+#define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr
+
+static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct zonefs_sb_info *sbi =
+ container_of(kobj, struct zonefs_sb_info, s_kobj);
+ struct zonefs_sysfs_attr *zonefs_attr =
+ container_of(attr, struct zonefs_sysfs_attr, attr);
+
+ if (!zonefs_attr->show)
+ return 0;
+
+ return zonefs_attr->show(sbi, buf);
+}
+
+static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files);
+}
+ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files);
+
+static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files));
+}
+ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files);
+
+static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files);
+}
+ZONEFS_SYSFS_ATTR_RO(max_active_seq_files);
+
+static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files));
+}
+ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files);
+
+static struct attribute *zonefs_sysfs_attrs[] = {
+ ATTR_LIST(max_wro_seq_files),
+ ATTR_LIST(nr_wro_seq_files),
+ ATTR_LIST(max_active_seq_files),
+ ATTR_LIST(nr_active_seq_files),
+ NULL,
+};
+ATTRIBUTE_GROUPS(zonefs_sysfs);
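ATTRIBUTE_GROUPS(zonefs_sysfs) generates the zonefs_sysfs_groups array plugged into the ktype below; it expands to roughly:

static const struct attribute_group zonefs_sysfs_group = {
        .attrs = zonefs_sysfs_attrs,
};
static const struct attribute_group *zonefs_sysfs_groups[] = {
        &zonefs_sysfs_group,
        NULL,
};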
+
+static void zonefs_sysfs_sb_release(struct kobject *kobj)
+{
+ struct zonefs_sb_info *sbi =
+ container_of(kobj, struct zonefs_sb_info, s_kobj);
+
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops zonefs_sysfs_attr_ops = {
+ .show = zonefs_sysfs_attr_show,
+};
+
+static struct kobj_type zonefs_sb_ktype = {
+ .default_groups = zonefs_sysfs_groups,
+ .sysfs_ops = &zonefs_sysfs_attr_ops,
+ .release = zonefs_sysfs_sb_release,
+};
+
+static struct kobject *zonefs_sysfs_root;
+
+int zonefs_sysfs_register(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ int ret;
+
+ init_completion(&sbi->s_kobj_unregister);
+ ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype,
+ zonefs_sysfs_root, "%s", sb->s_id);
+ if (ret) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ return ret;
+ }
+
+ sbi->s_sysfs_registered = true;
+
+ return 0;
+}
+
+void zonefs_sysfs_unregister(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+
+ if (!sbi || !sbi->s_sysfs_registered)
+ return;
+
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+}
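The completion closes a lifetime race: kobject_put() drops the last reference only once no sysfs reader still holds the embedded s_kobj, at which point the release callback fires and wait_for_completion() lets zonefs_kill_super() safely free the superblock info. The four counters then appear under /sys/fs/zonefs/<dev>/. A minimal reader, with the device name assumed:

/* Hypothetical device name; adjust to the mounted zoned block device. */
#include <stdio.h>

int main(void)
{
        char buf[32];
        FILE *f = fopen("/sys/fs/zonefs/nvme0n1/nr_active_seq_files", "r");

        if (f && fgets(buf, sizeof(buf), f))
                printf("active sequential files: %s", buf);
        if (f)
                fclose(f);
        return 0;
}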
+
+int __init zonefs_sysfs_init(void)
+{
+ zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj);
+ if (!zonefs_sysfs_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void zonefs_sysfs_exit(void)
+{
+ kobject_put(zonefs_sysfs_root);
+ zonefs_sysfs_root = NULL;
+}
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 7b147907c328..4b3de66c3233 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -12,6 +12,7 @@
#include <linux/uuid.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
+#include <linux/kobject.h>
/*
* Maximum length of file names: this only needs to be large enough to fit
@@ -39,6 +40,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
}
#define ZONEFS_ZONE_OPEN (1 << 0)
+#define ZONEFS_ZONE_ACTIVE (1 << 1)
/*
* In-memory inode data.
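ZONEFS_ZONE_OPEN records an explicit zone open issued by zonefs; ZONEFS_ZONE_ACTIVE also covers implicitly active zones, i.e. sequential zones that are partially written. A partially written zone that has been closed is therefore active but not open, matching the separate open and active zone resource limits of ZBC/ZAC and NVMe ZNS devices.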
@@ -182,8 +184,15 @@ struct zonefs_sb_info {
loff_t s_blocks;
loff_t s_used_blocks;
- unsigned int s_max_open_zones;
- atomic_t s_open_zones;
+ unsigned int s_max_wro_seq_files;
+ atomic_t s_wro_seq_files;
+
+ unsigned int s_max_active_seq_files;
+ atomic_t s_active_seq_files;
+
+ bool s_sysfs_registered;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
};
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
@@ -198,4 +207,9 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
#define zonefs_warn(sb, format, args...) \
pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args)
+int zonefs_sysfs_register(struct super_block *sb);
+void zonefs_sysfs_unregister(struct super_block *sb);
+int zonefs_sysfs_init(void);
+void zonefs_sysfs_exit(void);
+
#endif